Main code¶

In [ ]:
import os
import numpy as np
import scipy.io
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, r2_score
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from IPython.display import clear_output
import random
import sys

sklearn.linear_model.LinearRegression

sklearn.linear_model.Ridge

sklearn.linear_model.ElasticNet

sklearn.linear_model.LogisticRegression

Read and preprocess mat files¶

After reading and preprocessing the data in the next code chunk, the data from all cells is stored in a population dictionary. The dictionary's keys correspond to the cell names, and the values contain the data for each respective cell.

The values in the population dictionary are cell dictionaries with keys being 'axons', 'green_dFFMeanValues',and 'red_dFFMeanValues':

  • The value of 'axons', e.g., cell_data_dict['CL090_230515']['axons'] is a 1 dimensional numpy array, of which the length is the number of groups and the elements are 1 dimensional numpy arrays consisting of components belonging to the group.
  • The value of 'green_dFFMeanValues' is a 2 dimensional 3 by 49 numpy array (each cell has 3 rounds, and each round has 8 directions * 2 time frequencies * 3 space frequencies = 48 settings plus a extra period so in total there are 49 columns), of which the elements are still 2 dimensional numpy arrays with size being 10 by N (N is the number of components).
  • The value of 'red_dFFMeanValues' is similarly a 2 dimensional 3 by 49 numpy array, of which the elements are still 2 dimensional numpy arrays with size being 10 by 1 (only recording the data at the soma).
In [ ]:
root_path = "/content/drive/MyDrive/Fluorescence_Data/FluoData4Fitting_Average"

# Get a list of all the subdirectories: subfolders are viewed as cell names
cell_names = [f for f in os.listdir(root_path) if os.path.isdir(os.path.join(root_path, f))]
# for cell in cell_names:
#     print(cell)

# Create a dictionary with default values
default_value = 0
cell_data_dict = {cell: default_value for cell in cell_names}
# print(cell_data_dict)

file_suffixes = ['green_Axon.mat', 'green_dFFMeanValues.mat', 'red_dFFMeanValues.mat']

for cell in cell_names:
    print(cell)
    file_names = [cell + suffix for suffix in file_suffixes]

    path_ = file_names[0] # green_Axon.mat
    path_ = os.path.join(root_path, cell, path_)
    mat_data = scipy.io.loadmat(path_)
    axons = mat_data['Axons'] # array containing nested arrays/sub-arrays
    # Squeeze the outer array
    axons = np.squeeze(axons, axis=0)
    for i in range(len(axons)):
        # Squeeze the inner array and convert the data type to 'int'
        axons[i] = np.squeeze(axons[i].astype(int), axis=0)
    # final axons' length is the number of groups with
    # each elements being a nested array of components

    path_ = file_names[1] # green_dFFMeanValues.mat
    path_ = os.path.join(root_path, cell, path_)
    mat_data = scipy.io.loadmat(path_)
    dFFMeanValues_green = mat_data['dFFMeanValues'] # 3 by 49

    path_ = file_names[2] # red_dFFMeanValues.mat
    path_ = os.path.join(root_path, cell, path_)
    mat_data = scipy.io.loadmat(path_)
    dFFMeanValues_red = mat_data['dFFMeanValues'] # 3 by 49

    cell_data_dict[cell] = {'axons': axons,
                'green_dFFMeanValues': dFFMeanValues_green,
                'red_dFFMeanValues': dFFMeanValues_red}

# Print keys and types
for key, value in cell_data_dict.items():
    print("-- * * * * * --")
    print(key, type(value))
    for key_, value_ in value.items():
        print(key_, type(value_))
print("-- * * * * * --")
CL090_230515
CL090_230518
CL083_230413
CL075_230303
-- * * * * * --
CL090_230515 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL090_230518 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL083_230413 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
CL075_230303 <class 'dict'>
axons <class 'numpy.ndarray'>
green_dFFMeanValues <class 'numpy.ndarray'>
red_dFFMeanValues <class 'numpy.ndarray'>
-- * * * * * --
In [ ]:
# print to see data type and size
print(cell_data_dict['CL090_230515'].keys())
print(type(cell_data_dict['CL090_230515']['axons']))
print(cell_data_dict['CL090_230515']['axons'].shape)
print(cell_data_dict['CL090_230515']['axons'][1].shape)
print(cell_data_dict['CL090_230515']['axons'][1].dtype)
print("--------------------------------")
print(type(cell_data_dict['CL090_230515']['green_dFFMeanValues']))
print(cell_data_dict['CL090_230515']['green_dFFMeanValues'].shape)
print(type(cell_data_dict['CL090_230515']['green_dFFMeanValues'][1,1]))
print(cell_data_dict['CL090_230515']['green_dFFMeanValues'][0,1].shape)
print(cell_data_dict['CL083_230413']['green_dFFMeanValues'][0,1].shape)
print(cell_data_dict['CL083_230413']['green_dFFMeanValues'][0,1].dtype)
print("--------------------------------")
print(type(cell_data_dict['CL090_230515']['red_dFFMeanValues']))
print(cell_data_dict['CL090_230515']['red_dFFMeanValues'].shape)
print(type(cell_data_dict['CL090_230515']['red_dFFMeanValues'][1,1]))
print(cell_data_dict['CL090_230515']['red_dFFMeanValues'][0,1].shape)
print(cell_data_dict['CL083_230413']['red_dFFMeanValues'][0,1].shape)
print(cell_data_dict['CL083_230413']['red_dFFMeanValues'][0,1].dtype)
dict_keys(['axons', 'green_dFFMeanValues', 'red_dFFMeanValues'])
<class 'numpy.ndarray'>
(25,)
(19,)
int64
--------------------------------
<class 'numpy.ndarray'>
(3, 49)
<class 'numpy.ndarray'>
(10, 281)
(10, 155)
float64
--------------------------------
<class 'numpy.ndarray'>
(3, 49)
<class 'numpy.ndarray'>
(10, 1)
(10, 2)
float64

Note:

  • Four cells: 'CL090_230515', 'CL090_230518', 'CL083_230413', 'CL075_230303'.

  • 'red_dFFMeanValues' and 'green_dFFMeanValues' have 49 columns,where the last column should be excluded. They are supposed to have 3 rows (3 rounds), but 'CL090_230518' only has 2 rows.

  • In 'CL083_230413', elements in 'red_dFFMeanValues' have 2 columns (10 × 2, should be 10 × 1), so 'CL083_230413' is not used.

Fit data to model¶

Functions¶

In [ ]:
def plot_comparison(y_test, y_pred, subtitle = ''):
    # Sort y_pred and y_test based on y_test
    sorted_indices = np.argsort(y_test)
    sorted_y_pred = y_pred[sorted_indices]
    sorted_y_test = y_test[sorted_indices]
    # Plot sorted_y_pred and sorted_y_test
    plt.plot(sorted_y_pred, label='Sorted Predictions')
    plt.plot(sorted_y_test, label='Sorted Ground Truth')
    plt.xlabel('Index')
    plt.ylabel('Value')
    plt.title(f'Comparison of Sorted Predictions and Sorted Ground Truth \n ({subtitle})')
    plt.legend()
    plt.savefig(f'Comparison ({subtitle}).png', bbox_inches='tight')
    # bbox_inches='tight' will adjust the figure's bounding box to fit all the content, ensuring that
    # the complete words are visible in the saved figure. Otherwise, the saved figure may not show the
    # complete words, e.g., for x-label, or for the long title.
    # The default setting can sometimes result in cutoff or clipped text. It tries to include the entire
    # figure within the saved image, but there may be cases where the default behavior is not sufficient
    # to capture all the content. The default behavior assumes the figure content fits within the predefined
    # margins and padding.
    plt.show()

Reorganize the data (vstack)¶

In [ ]:
# cell_data = cell_data_dict['CL090_230515']
cell_data = cell_data_dict['CL075_230303']

delete_small_group = True # delete groups (axons) with less than 3 components

data_green = cell_data['green_dFFMeanValues'][:,:-1] # exclude 49th column
data_red = cell_data['red_dFFMeanValues'][:,:-1] # exclude 49th column
data_axons = cell_data['axons']
# print(data_axons)
# print(type(data_axons),len(data_axons),data_axons)
if delete_small_group:
    data_axons = np.array([axons_ for axons_ in data_axons if len(axons_) >= 3])
# print(type(data_axons),len(data_axons),data_axons)

# vstack green data
stacked_green = np.empty((0, data_green[0,0].shape[1]))
# Enumerate the elements in the np array and vstack them
for index, value in np.ndenumerate(data_green):
    stacked_green = np.vstack((stacked_green, value))
print(stacked_green.shape, 48*3*10)

# group columns of green data
group_num = data_axons.shape[0]
group_satcked_green = np.zeros((stacked_green.shape[0], group_num))
for i, cols in enumerate(data_axons):
    group_satcked_green[:, i] = np.sum(stacked_green[:, cols-1], axis=1)
print(group_satcked_green.shape, data_axons.shape)

# vstack red data
stacked_red = np.empty((0, data_red[0,0].shape[1]))
# Enumerate the elements in the np array and vstack them
for index, value in np.ndenumerate(data_red):
    stacked_red = np.vstack((stacked_red, value))
print(stacked_red.shape, 48*3*10)

print(np.max(group_satcked_green), np.min(group_satcked_green))
print(np.max(stacked_red), np.min(stacked_red))
(1440, 80) 1440
(1440, 5) (5,)
(1440, 1) 1440
27.44283678657392 -9.505092954427656
0.4404603811981394 -0.20703030293882338
<ipython-input-118-ddad8385ad7f>:12: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  data_axons = np.array([axons_ for axons_ in data_axons if len(axons_) >= 3])

Linear regression¶

Divide train and val datasets¶

In [ ]:
# independent data
x = group_satcked_green

# dependent data (labels/targets)
y = np.squeeze(stacked_red)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)

Ordinary linear regression¶

Ordinary least squares Linear Regression.

Linear Regression fits a linear model with coefficients to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.

Fit and predict¶
In [ ]:
# Create a LinearRegression object
model = linear_model.LinearRegression()

# Fit the model on the training data
model.fit(x_train, y_train)
# Print the fitted coefficients
print("Fitted Coefficients:", model.coef_)
# Print the fitted intercept
print("Fitted Intercept:", model.intercept_)

# Predict on the test data
y_pred = model.predict(x_test)
Fitted Coefficients: [ 0.02750133  0.00644773  0.01610286 -0.00245954  0.00325725]
Fitted Intercept: 0.021384507405425024
Evaluate¶
In [ ]:
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
Mean squared error: 0.0021013740360479637
Correlation coefficient: 0.5516491307623594
Coefficient of determination (R-squared score, R2 score): 0.30412235302891344
In [ ]:
plot_comparison(y_test, y_pred, 'Ordinary Linear Regression, Test Set')
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = model.predict(x_train)

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
Mean squared error: 0.0016130542307865325
Correlation coefficient: 0.5303604736894671
Coefficient of determination (R-squared score, R2 score): 0.2812822320521161
In [ ]:
plot_comparison(y_train, y_pred_, 'Ordinary Linear Regression, Train Set')
In [ ]:
# digitized results
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
---- ---- ----
Mean squared error: 1.3611111111111112
Correlation coefficient: 0.5192778681631646
Coefficient of determination (R-squared score, R2 score): 0.2679738562091504
Mean squared error: 1.1140350877192982
Correlation coefficient: 0.47374351663713143
Coefficient of determination (R-squared score, R2 score): 0.21872132035869019

Ridge linear regression¶

Linear least squares with l2 regularization.

Minimizes the objective function:

$$ ||y - Xw||^2_2 + \alpha ||w||^2_2 $$

This model solves a regression model where the loss function is the linear least squares function and regularization is given by the l2-norm.

Fit and predict¶
In [ ]:
# Create a Ridge Regression object
ridge_model = linear_model.Ridge(alpha=1.0)  # You can adjust the value of alpha as per your requirements

# Fit the model on the training data
ridge_model.fit(x_train, y_train)

# Print the fitted coefficients
print("Fitted Coefficients:", ridge_model.coef_)

# Print the fitted intercept
print("Fitted Intercept:", ridge_model.intercept_)

# Predict on the test data
y_pred = ridge_model.predict(x_test)
Fitted Coefficients: [ 0.02737809  0.00640785  0.01592334 -0.00242911  0.00326877]
Fitted Intercept: 0.02141557952594624
Evaluate¶
In [ ]:
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_test, y_pred, 'Ridge Linear Regression, Test Set')
Mean squared error: 0.0020998258646144365
Correlation coefficient: 0.5521413386329813
Coefficient of determination (R-squared score, R2 score): 0.30463503562410577
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = ridge_model.predict(x_train)

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Ridge Linear Regression, Train Set')
Mean squared error: 0.0016130590000993249
Correlation coefficient: 0.5303588713620216
Coefficient of determination (R-squared score, R2 score): 0.2812801070213643
In [ ]:
# digitized results
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
---- ---- ----
Mean squared error: 1.3611111111111112
Correlation coefficient: 0.5192778681631646
Coefficient of determination (R-squared score, R2 score): 0.2679738562091504
Mean squared error: 1.1103801169590644
Correlation coefficient: 0.47591543732849056
Coefficient of determination (R-squared score, R2 score): 0.22128457061998064

ElasticNet linear regression¶

Linear regression with combined L1 and L2 priors as regularizer.

Minimizes the objective function:

$$ 1 / (2 * n_{samples}) * ||y - Xw||^2_2 + \alpha * l1_{ratio} * ||w||_1 + 0.5 * \alpha * (1 - l1_{ratio}) * ||w||^2_2 $$

If controlling the L1 and L2 penalty separately, that this is equivalent to:

$$ a * ||w||_1 + 0.5 * b * ||w||_2^2 $$

where: $\alpha = a + b$ and $l1_{ratio} = a / (a + b)$.

Fit and predict¶
In [ ]:
# Create an ElasticNet object
a = 0.004; b = 0.00
alpha = a + b; l1_ratio = a / (a + b)
elasticnet_model = linear_model.ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=100000)
# adjust the values of alpha and l1_ratio as per your requirements

# Fit the model on the training data
elasticnet_model.fit(x_train, y_train)

# Print the fitted coefficients
print("Fitted Coefficients:", elasticnet_model.coef_)

# Print the fitted intercept
print("Fitted Intercept:", elasticnet_model.intercept_)

# Predict on the test data
y_pred = elasticnet_model.predict(x_test)

# It is normal to encounter warning here, because our data is not linear enough
# and not normailized to guarantee a low error/residual, even though we set a very
# large max_iter. But the results is similar to that of ordinary and Ridge linear
# regression. Like alpha = 0 results in the same results as the ordinary linear
# rergession.
Fitted Coefficients: [0.00866444 0.         0.         0.00158259 0.0047032 ]
Fitted Intercept: 0.022844047405427618
Evaluate¶
In [ ]:
mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_test, y_pred, 'Elasticnet Linear Regression, Test Set')
Mean squared error: 0.0020020605564851465
Correlation coefficient: 0.6136634710485406
Coefficient of determination (R-squared score, R2 score): 0.33701037262235
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = elasticnet_model.predict(x_train)

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Elasticnet Linear Regression, Train Set')
Mean squared error: 0.0016981988382008856
Correlation coefficient: 0.4970722873576219
Coefficient of determination (R-squared score, R2 score): 0.2433449197003772
In [ ]:
# digitized results
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
---- ---- ----
Mean squared error: 1.3472222222222223
Correlation coefficient: 0.5463431384225381
Coefficient of determination (R-squared score, R2 score): 0.2754435107376284
Mean squared error: 1.1359649122807018
Correlation coefficient: 0.4511357285754583
Coefficient of determination (R-squared score, R2 score): 0.20334181879094793

Power-law regression¶

Mathematically, a power-law relationship can be expressed as:

$$ y = A X^C $$

Here, I modify it, shown as:

$$ y = A (X+B)^C + D $$

where, $X = \beta_1 x_1 + \beta_2 x_2 \dots + \beta_N x_N$. $X+D$ is a linear regression part. $A$, $B$, $C$, $D$, $\beta_1$, $\beta_2$, ...,$\beta_N$ are parameters to be determined.

Divide train and val datasets¶

In [ ]:
# independent data
x = group_satcked_green

# dependent data (labels/targets)
y = np.squeeze(stacked_red)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)

Fit, predict and evaluate (Example 1)¶

In [ ]:
exponent = 5
# Define the model function
def func(X, *params):
    A, B, D = params[:3]
    # Compute the weighted sum
    weighted_sum = np.sum(X * np.array(params[3:]), axis=1)
    base = weighted_sum + B
    power_result = np.power(base, exponent)
    return A * power_result + D

# Create a LinearRegression object
model = linear_model.LinearRegression()

# give the initial params using linear regression
# so that the params are within a reasonable range
model.fit(x_train, y_train)
# print("Fitted Coefficients:", model.coef_)
# print("Fitted Intercept:", model.intercept_)
num_features = x_train.shape[1]
# Assuming model.coef_ is the np array containing the coefficients
model_coefs = model.coef_
# Set negative elements to 0 using np.clip()
initial_params = [1, model.intercept_, 0] + list(model_coefs) # Initial parameter guesses
# model_coefs_clipped = np.clip(model_coefs, 0, np.inf)
# initial_params = [1, model.intercept_, 0] + list(model_coefs_clipped) # Initial parameter guesses, no need to clip -- they can be negative

# Set lower and upper bounds for the parameters
lower_bounds = [0, -np.inf, -np.inf] + [0] * num_features
upper_bounds = [np.inf, np.inf, np.inf] + [np.inf] * num_features

# Combine the lower and upper bounds into a 2-tuple of array_like
bounds = (lower_bounds, upper_bounds)

# Perform the curve fit with bounds
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)
# params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, bounds=bounds, maxfev=1000000)

# Print the fitted parameters
print("Fitted Parameters:", params)

# predict on test
A, B, D = params[:3]
weighted_sum = np.sum(x_test * np.array(params[3:]), axis=1)
base = weighted_sum + B
abs_base = np.abs(base)
sign = np.sign(base)
power_result = np.power(abs_base, exponent)
y_pred = A * sign * power_result + D

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_test, y_pred, 'Power-Law Regression Exponent=5, Test Set')


# predict on train
A, B, D = params[:3]
weighted_sum = np.sum(x_train * np.array(params[3:]), axis=1)
base = weighted_sum + B
abs_base = np.abs(base)
sign = np.sign(base)
power_result = np.power(abs_base, exponent)
y_pred_ = A * sign * power_result + D

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Power-Law Regression (Exponent=5), Train Set')

# digitized results
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
Fitted Parameters: [ 1.18600565e+01  3.44121000e-01 -4.05562036e-02  2.77702818e-02
  2.47117908e-03  2.09451252e-02 -2.56362694e-03  3.66822262e-03]
Mean squared error: 0.0026229360598610057
Correlation coefficient: 0.47446924924518413
Coefficient of determination (R-squared score, R2 score): 0.13140519384906468
Mean squared error: 0.001563847057598201
Correlation coefficient: 0.550642481932762
Coefficient of determination (R-squared score, R2 score): 0.30320714257648085
---- ---- ----
Mean squared error: 1.7083333333333333
Correlation coefficient: 0.45098751419685795
Coefficient of determination (R-squared score, R2 score): 0.08123249299719892
Mean squared error: 1.1001461988304093
Correlation coefficient: 0.4832959693241634
Coefficient of determination (R-squared score, R2 score): 0.22846167135159368

Fit, predict and evaluate (Example 2)¶

In [ ]:
# compared with example 1, here only fit A and D.

exponent = 5

model = linear_model.LinearRegression()
model.fit(x_train, y_train)
# now we have: model.intercept_ and model.coef_
# Print the fitted coefficients
print("Fitted Coefficients:", model.coef_)
# Print the fitted intercept
print("Fitted Intercept:", model.intercept_)

# Define the model function
def func(X, A, D):
    # Compute the weighted sum
    weighted_sum = np.sum(X * np.array(model.coef_), axis=1)
    base = weighted_sum
    power_result = np.power(base, exponent)
    return A * power_result + D

initial_params = [1, 0]

# Perform the curve fit
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)

# Print the fitted parameters
print("Fitted Parameters:", params)


# predict on test
A, D = params
y_pred = func(x_test, A, D)

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_test, y_pred, 'Power-Law Regression Exponent=5, only fit A and D, Test Set')

# predict on train
A, D = params
y_pred_ = func(x_train, A, D)

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Power-Law Regression Exponent=5, only fit A and D, Train Set')
Fitted Coefficients: [ 0.02750133  0.00644773  0.01610286 -0.00245954  0.00325725]
Fitted Intercept: 0.021384507405425024
Fitted Parameters: [4.05181437e+03 1.77237254e-02]
Mean squared error: 0.005299362470883888
Correlation coefficient: 0.2996981978301417
Coefficient of determination (R-squared score, R2 score): -0.7549031364359122
Mean squared error: 0.001978769231784402
Correlation coefficient: 0.3439957106418279
Coefficient of determination (R-squared score, R2 score): 0.1183330489399761

Fit and predict¶

In [ ]:
# generate irreducible fraction with an odd number as the denominator
# such numbers can work as the exponent for negative numbers and
# will be used as the parameter "C" in Power-law regression below:
# y = A * (B+b1*x1+b2*x2+...+bN*xN)**C + D

def gcd(a, b): # calculate the greatest common divisor of two numbers
    while b:
        a, b = b, a % b
    return a

def generate_irreducible_fraction(existing_fractions = []):
    while True:
        numerator = random.randint(1, 400)  # Random numerator
        denominator = random.randrange(1, 100, 2)  # Random odd denominator
        if gcd(numerator, denominator) == 1:  # Check if the fraction is irreducible
            fraction = (numerator, denominator)
            if fraction not in existing_fractions:  # Check if the fraction is not a duplicate
                return fraction

# Generate irreducible fraction numbers
N_faction = 40
upper_bound = 50
irreducible_fractions = []
while len(irreducible_fractions) < N_faction:
    fraction = generate_irreducible_fraction(irreducible_fractions)
    if fraction[0]/fraction[1] < upper_bound:
        irreducible_fractions.append(fraction)

# Sort the irreducible fractions
irreducible_fractions.sort(key=lambda f: f[0] / f[1])

# Print the irreducible fraction numbers
for numerator, denominator in irreducible_fractions:
    if random.random() <= 0.1:
        print(f"{numerator}/{denominator}")


# Extract the numerator and denominator values
indexes = range(1, len(irreducible_fractions) + 1)
values = [numerator / denominator for numerator, denominator in irreducible_fractions]

# Plot the irreducible fractions
plt.plot(values, 'o-')
plt.xlabel("Index")
plt.ylabel("Irreducible Fraction")
plt.title("Irreducible Fractions")
plt.show()

## if not use the above random generated irreducible_fractions, define it below.
irreducible_fractions = [(1,95), (30,43), (179,65), (5,1), (221,33), (219,23), (300,17), (73,3)]
158/99
63/17
87/11
390/43
In [ ]:
# # old code! The new one is in the next chunk.

# params_list = []
# for numerator, denominator in irreducible_fractions:
#     C1, C2 = numerator, denominator

#     # Define the model function
#     def func(X, *params):
#         A, B, D = params[:3]
#         # Compute the weighted sum
#         weighted_sum = np.sum(X * np.array(params[3:]), axis=1)
#         base = weighted_sum + B
#         abs_base = np.abs(base)
#         sign = np.sign(base)
#         power_result = np.power(abs_base, C1 / C2)
#         return A * sign * power_result + D

#     # Create a LinearRegression object
#     model = linear_model.LinearRegression()

#     # give the initial params using linear regression
#     # so that the params are within a reasonable range
#     model.fit(x_train, y_train)
#     # print("Fitted Coefficients:", model.coef_)
#     # print("Fitted Intercept:", model.intercept_)
#     num_features = x_train.shape[1]
#     initial_params = [1, model.intercept_, 0] + list(model.coef_) # Initial parameter guesses

#     # Perform the curve fit
#     params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)
#     params_list.append(params)

#     # Print the fitted parameters
#     # print("Fitted Parameters:", params)

# # predict on test
# y_predict_test_list = []
# for (numerator, denominator), params in zip(irreducible_fractions, params_list):
#     C1, C2 = numerator, denominator
#     A, B, D = params[:3]
#     weighted_sum = np.sum(x_test * np.array(params[3:]), axis=1)
#     base = weighted_sum + B
#     abs_base = np.abs(base)
#     sign = np.sign(base)
#     power_result = np.power(abs_base, C1 / C2)
#     y_pred = A * sign * power_result + D
#     y_predict_test_list.append(y_pred)

# # predict on train
# y_predict_train_list = []
# for (numerator, denominator), params in zip(irreducible_fractions, params_list):
#     C1, C2 = numerator, denominator
#     A, B, D = params[:3]
#     weighted_sum = np.sum(x_train * np.array(params[3:]), axis=1)
#     base = weighted_sum + B
#     abs_base = np.abs(base)
#     sign = np.sign(base)
#     power_result = np.power(abs_base, C1 / C2)
#     y_pred_ = A * sign * power_result + D
#     y_predict_train_list.append(y_pred_)
In [ ]:
# new code, a updated version for the code in last chunk.
# input and output data normalization is achieved (but it
# seems normalization is redundant).

class CurveFit_with_Normalization:
    def __init__(self, exponent_numerator=1, exponent_denominator=1, input_range=[1,2], output_range=[1,2]):
        self.input_min = input_range[0]
        self.input_max = input_range[1]
        self.output_min = output_range[0]
        self.output_max = output_range[1]
        self.input_scale = None
        self.input_shift = None
        self.output_scale = None
        self.output_shift = None
        self.exponent_numerator = exponent_numerator
        self.exponent_denominator = exponent_denominator
        self.linear_model = linear_model.LinearRegression()

    def fit(self, X, y):
        # Normalize the input and output data
        self.input_scale = (self.input_max - self.input_min) / (np.max(X) - np.min(X))
        self.input_shift = self.input_min - np.min(X) * self.input_scale
        normalized_X = self.input_scale * X + self.input_shift

        self.output_scale = (self.output_max - self.output_min) / (np.max(y) - np.min(y))
        self.output_shift = self.output_min - np.min(y) * self.output_scale
        normalized_y = self.output_scale * y + self.output_shift

        def normalized_func_(X_normalize, *params):
            A, B, D = params[:3]
            # Compute the weighted sum
            weighted_sum = np.sum(X_normalize * np.array(params[3:]), axis=1)
            base = weighted_sum + B
            abs_base = np.abs(base)
            sign = np.sign(base)
            power_result = np.power(abs_base, self.exponent_numerator / self.exponent_denominator)
            return A * sign * power_result + D

        # Give the initial params using linear regression
        self.linear_model.fit(normalized_X, normalized_y)
        # print("Fitted Coefficients:", linear_model.coef_)
        # print("Fitted Intercept:", linear_model.intercept_)
        # num_features = X.shape[1]
        exponent = self.exponent_numerator / self.exponent_denominator
        initial_params = [1, self.linear_model.intercept_ / exponent + (1 - 1 / exponent), 0] + list(self.linear_model.coef_ /
                        exponent)  # Initial parameter guesses
        # initial_params = [1, 0, 0] + [1 / X.shape[1]] * (X.shape[1])

        # Perform the normalized curve fit
        normalized_params, params_covariance = curve_fit(normalized_func_, normalized_X, normalized_y,
                                  p0=initial_params, maxfev=100000000)

        # Store the fitted parameters
        self.normalized_fitted_params = normalized_params

    def predict(self, X):
        # Normalize the input data using the previously calculated scaling and shifting parameters
        normalized_X = self.input_scale * X + self.input_shift

        # Make predictions using the denormalized parameters
        y_pred = self.normalized_func(normalized_X, self.exponent_numerator,
                              self.exponent_denominator, *self.normalized_fitted_params)

        # Denormalize the predicted output
        y_pred = (y_pred - self.output_shift) / self.output_scale

        return y_pred

    @staticmethod
    def normalized_func(X_normalize, exponent_numerator, exponent_denominator, *params):
        A, B, D = params[:3]
        # Compute the weighted sum
        weighted_sum = np.sum(X_normalize * np.array(params[3:]), axis=1)
        base = weighted_sum + B
        abs_base = np.abs(base)
        sign = np.sign(base)
        power_result = np.power(abs_base, exponent_numerator / exponent_denominator)
        return A * sign * power_result + D

model_list = []
for numerator, denominator in irreducible_fractions:
    # Create an instance of NormalizedCurveFit
    model = CurveFit_with_Normalization(numerator, denominator, input_range=[1, 2], output_range=[1, 2])

    # Fit the model to your input and output data
    model.fit(x_train, y_train)

    model_list.append(model)
In [ ]:
# this is for the new version code in the last chunk

r2_score_test_list = []

# predict on test
y_predict_test_list = []
for model, (numerator, denominator) in zip(model_list, irreducible_fractions):
    y_pred = model.predict(x_test)

    print(f'-------- \n (numerator, denominator) is: ({numerator}, {denominator})')

    mse = mean_squared_error(y_test, y_pred)
    print("Mean squared error:", mse)

    correlation = np.corrcoef(y_pred, y_test)[0, 1]
    print("Correlation coefficient:", correlation)

    r_squared = r2_score(y_test, y_pred)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)

    r2_score_test_list.append(r_squared)

    y_predict_test_list.append(y_pred)

print("|||||||||||||||||||||||||||||||||||||")

r2_score_train_list = []

# predict on train
y_predict_train_list = []
for model, (numerator, denominator) in zip(model_list, irreducible_fractions):
    y_pred_ = model.predict(x_train)

    print(f'-------- \n (numerator, denominator) is: ({numerator}, {denominator})')

    mse = mean_squared_error(y_train, y_pred_)
    print("Mean squared error:", mse)

    # Calculate the correlation coefficient
    correlation = np.corrcoef(y_pred_, y_train)[0, 1]
    print("Correlation coefficient:", correlation)

    r_squared = r2_score(y_train, y_pred_)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)

    r2_score_train_list.append(r_squared)

    y_predict_train_list.append(y_pred_)


x = [numerator / denominator for (numerator, denominator) in irreducible_fractions]



# plot the r2 score curve over exponent
fig, ax = plt.subplots(figsize=(7.5, 5))  # Adjust the values as desired

# Plot the R-squared scores
ax.plot(x, r2_score_train_list, label='Train R-squared')
ax.plot(x, r2_score_test_list, label='Test R-squared')

# Set labels and title with font size
ax.set_xlabel('Exponent in Power Law', fontsize=14)
ax.set_ylabel('R-squared', fontsize=14)
ax.set_title('R-squared Scores', fontsize=16)

# Set tick label font size
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)

# Set legend
ax.legend(fontsize=12)

fig.savefig('Power_Law_r2_scores_plot.png')

# Display the plot
plt.show()
-------- 
 (numerator, denominator) is: (1, 95)
Mean squared error: 0.002100964640127741
Correlation coefficient: 0.5517910432357731
Coefficient of determination (R-squared score, R2 score): 0.3042579259753555
-------- 
 (numerator, denominator) is: (30, 43)
Mean squared error: 0.002101281865505407
Correlation coefficient: 0.5516808169870028
Coefficient of determination (R-squared score, R2 score): 0.30415287563896487
-------- 
 (numerator, denominator) is: (179, 65)
Mean squared error: 0.0025250507543628977
Correlation coefficient: 0.4832436056815392
Coefficient of determination (R-squared score, R2 score): 0.16382026841198005
-------- 
 (numerator, denominator) is: (5, 1)
Mean squared error: 0.0026225156812671405
Correlation coefficient: 0.47450623106504053
Coefficient of determination (R-squared score, R2 score): 0.13154440374778698
-------- 
 (numerator, denominator) is: (221, 33)
Mean squared error: 0.0026499746952767882
Correlation coefficient: 0.4722949272041147
Coefficient of determination (R-squared score, R2 score): 0.12245125149150604
-------- 
 (numerator, denominator) is: (219, 23)
Mean squared error: 0.0026738709969947676
Correlation coefficient: 0.47042871941472747
Coefficient of determination (R-squared score, R2 score): 0.1145379043550332
-------- 
 (numerator, denominator) is: (300, 17)
Mean squared error: 0.002698617492593064
Correlation coefficient: 0.4685731269373298
Coefficient of determination (R-squared score, R2 score): 0.10634301242607913
-------- 
 (numerator, denominator) is: (73, 3)
Mean squared error: 0.0027064284283282954
Correlation coefficient: 0.46800240398942017
Coefficient of determination (R-squared score, R2 score): 0.10375639267784165
|||||||||||||||||||||||||||||||||||||
-------- 
 (numerator, denominator) is: (1, 95)
Mean squared error: 0.0016134189436037255
Correlation coefficient: 0.530207263618693
Coefficient of determination (R-squared score, R2 score): 0.2811197294053285
-------- 
 (numerator, denominator) is: (30, 43)
Mean squared error: 0.0016131201665299491
Correlation coefficient: 0.5303327777597914
Coefficient of determination (R-squared score, R2 score): 0.28125285350461793
-------- 
 (numerator, denominator) is: (179, 65)
Mean squared error: 0.001564363067723666
Correlation coefficient: 0.5504336724006264
Coefficient of determination (R-squared score, R2 score): 0.3029772274015695
-------- 
 (numerator, denominator) is: (5, 1)
Mean squared error: 0.0015638470628401345
Correlation coefficient: 0.5506424830440508
Coefficient of determination (R-squared score, R2 score): 0.3032071402408676
-------- 
 (numerator, denominator) is: (221, 33)
Mean squared error: 0.0015638033309179715
Correlation coefficient: 0.550660172508143
Coefficient of determination (R-squared score, R2 score): 0.30322662558047075
-------- 
 (numerator, denominator) is: (219, 23)
Mean squared error: 0.0015637917943133492
Correlation coefficient: 0.5506648398692617
Coefficient of determination (R-squared score, R2 score): 0.30323176586810985
-------- 
 (numerator, denominator) is: (300, 17)
Mean squared error: 0.0015638010388691812
Correlation coefficient: 0.5506610998555096
Coefficient of determination (R-squared score, R2 score): 0.30322764683329595
-------- 
 (numerator, denominator) is: (73, 3)
Mean squared error: 0.0015638079818543363
Correlation coefficient: 0.5506582908609674
Coefficient of determination (R-squared score, R2 score): 0.30322455329391074
In [ ]:
# for model in model_list:
#     print(model.exponent_numerator, model.exponent_denominator)

Evaluate¶

In [ ]:
# both the old and new versions (in the last subsection "Fit and predict") of code
# share the same evaluate code in this subsectoon.

# Create and update multiple figures (test)
for y_pred, (numerator, denominator) in zip(y_predict_test_list, irreducible_fractions):
    plot_comparison(y_test, y_pred, f'Power-Law Regression Exponent={numerator} over {denominator}, Test Set')
    # here pay attention: cannot use / replace over in the name, otherwise cannot save the fig
    # because / cannot be in a file name.
    print(y_pred[0])
    clear_output(wait=True)  # Clear the previous output
-0.0068363855310673224
In [ ]:
# Create and update multiple figures (train)
for y_pred_, (numerator, denominator) in zip(y_predict_train_list, irreducible_fractions):
    plot_comparison(y_train, y_pred_, f'Power-Law Regression Exponent={numerator} over {denominator}, Train Set')
    print(y_pred_[0])
    clear_output(wait=True)  # Clear the previous output
-0.006264150549880837

Exponential regression¶

Let $B = (b_1, b_2, \ldots , b_N)$. $$ y = A \cdot e^{(b_1 \cdot x_1 + \ldots + b_N \cdot x_N)} + C $$

Divide train and val datasets¶

In [ ]:
# independent data
x = group_satcked_green

# dependent data (labels/targets)
y = np.squeeze(stacked_red)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)

Fit and predict¶

In [ ]:
# Define the model function
def func(X, *params):
    A, C = params[:2]
    return A * np.exp(np.sum(X * np.array(params[2:]), axis=1)) + C

# give the initial params using linear regression
# so that the params are within a reasonable range
num_features = x_train.shape[1]
initial_params = [np.mean(y_train), 0] + [0] * num_features # Initial parameter guesses

# Perform the curve fit
params, params_covariance = curve_fit(func, x_train, y_train, p0=initial_params, maxfev=1000000)

# Print the fitted parameters
print("Fitted Parameters:", params)

# predict on test
A, C = params[:2]
y_pred = A * np.exp(np.sum(x_test * np.array(params[2:]), axis=1)) + C

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

# predict on train
A, C = params[:2]
y_pred_ = A * np.exp(np.sum(x_train * np.array(params[2:]), axis=1)) + C

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

r2_score_train_list.append(r_squared)

# digitized results
print("---- ---- ----")
class_num = 16
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
print(y_pred)
print(y_pred_test_digital)
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
Fitted Parameters: [ 0.07779241 -0.06104678  0.29536653  0.02428094  0.22627202 -0.02735473
  0.03966839]
Mean squared error: 0.002726635527392822
Correlation coefficient: 0.466554602832389
Coefficient of determination (R-squared score, R2 score): 0.09706473840405994
Mean squared error: 0.0015638339543745966
Correlation coefficient: 0.5506477829720628
Coefficient of determination (R-squared score, R2 score): 0.3032129808920452
---- ---- ----
[-0.00689957 -0.00666178  0.01360607  0.00661662  0.00731099  0.02449375
  0.01326078  0.03334951  0.00512768  0.03277946  0.00727101  0.01555067
  0.12159638  0.01786017  0.01402178  0.02199172  0.01459452  0.01422314
  0.01935669 -0.00708446  0.03325915  0.0069828   0.00489899  0.01448913
  0.00301878  0.35765372  0.00477497  0.02670886  0.01733475 -0.01236122
  0.01620576  0.02950386 -0.00751631  0.0071461   0.0206256   0.00699687
  0.01942845  0.03431444  0.01141902  0.02229449  0.0212914  -0.01130567
  0.02393423  0.01708261  0.03761861  0.01389856  0.00729412  0.01502123
  0.00753736  0.10468944  0.02559665  0.01614424  0.02416139  0.020876
  0.01015729  0.00617441  0.01281935  0.00533284  0.01605946  0.02546893
  0.01672694  0.01258805  0.01521004 -0.00640577 -0.00895642  0.01126611
  0.02573762  0.01152167 -0.00115906  0.00837762  0.0270282   0.03119955]
[ 4  4  5  5  5  5  5  5  5  5  5  5  8  5  5  5  5  5  5  4  5  5  5  5
  5 13  5  5  5  4  5  5  4  5  5  5  5  5  5  5  5  4  5  5  6  5  5  5
  5  7  5  5  5  5  5  5  5  5  5  5  5  5  5  4  4  5  5  5  5  5  5  5]
Mean squared error: 1.7083333333333333
Correlation coefficient: 0.45098751419685795
Coefficient of determination (R-squared score, R2 score): 0.08123249299719892
Mean squared error: 1.1008771929824561
Correlation coefficient: 0.48299619891345313
Coefficient of determination (R-squared score, R2 score): 0.22794902129933559

Evaluate¶

In [ ]:
plot_comparison(y_test, y_pred, 'Exponential Regression, Test Set')
In [ ]:
plot_comparison(y_train, y_pred_, 'Exponential Regression, Train Set')

Logistic regression¶

Divide train and val datasets¶

In [ ]:
# independent data
x = group_satcked_green

class_num = 480

# dependent data (labels/targets)
y = np.squeeze(stacked_red)
# print(np.max(y), np.min(y))

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
# print(intervals)
# Digitize the array to get the indices of the intervals
y_train = np.digitize(y_train, intervals) - 1
y_test = np.digitize(y_test, intervals) - 1
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
unique_elements = np.unique(y_train)
print("Unique elements:", unique_elements)
print("Number of unique elements:", len(unique_elements))
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)
Unique elements: [ 65  71  72  75  79  80  85  88  91  94  95  99 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
 215 216 217 218 219 220 221 224 225 226 227 228 229 230 231 232 233 234
 235 236 237 238 239 240 241 242 244 245 248 250 252 255 256 257 259 260
 262 265 268 269 270 274 278 297 304 335 341 346 350 353 357 378 429 479]
Number of unique elements: 180

Fit and predict¶

'multinomial' (default option for multi-calss) achieves better performance than 'ovr'.

In [ ]:
# fit
model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='multinomial')
# model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='ovr')

fit_result = model.fit(x_train, y_train)
print(fit_result.intercept_.shape, fit_result.coef_.shape)
# print("Coefficients:", model.coef_[0,:])
# print("Intercept:", model.intercept_[0])
print('--- --- ---')

# predict
# Use the trained model to make predictions
y_pred = model.predict(x_test)
# Alternatively, you can get the predicted probabilities for each class
y_prob = model.predict_proba(x_test)

print('y_prob.shape:', y_prob.shape)
print(np.sum(y_prob, axis = 1))
# print(y_prob[0,:])

# Print the predicted class labels
print('y_pred:', y_pred)
print('y_test:', y_test)
print('y_pred shape:', y_pred.shape, 'y_test shape:', y_test.shape)
# Print the predicted probabilities
# print(y_prob)
(180,) (180, 5)
--- --- ---
y_prob.shape: (72, 180)
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
y_pred: [162 141 167 152 162 170 168 166 162 170 152 169 260 138 167 170 157 173
 162  99 166 152 162 163 162 378 152 190 140 162 162 166 162 162 190 140
 150 180 162 166 157 147 166 170 163 166 152 138 162 202 190 162 190 166
 174 190 171 162 190 170 162 162 190 140 147 162 158 138 162 158 199 184]
y_test: [153 188 169 125 156 138 190 188 168 182 167 158 261 179 183 152 126 170
 256   0 170 174 159 138 146 263 183 164 139 139 161 172 145 151 135 122
 141 181 173 166 188 125 172 307 193 155 156 180 151 275 143 177 172 162
 178 156 153 148 165 177 219 146 169 175 157 190 136  79 202 170 163 181]
y_pred shape: (72,) y_test shape: (72,)

Evaluate¶

Evaluate (normal)¶

In previous data division, I classify data into class_num (e.g., class_num = 160) intervals (histogram, by np.digitize). Here, evaluate the results with the same number of classes (e.g., class_num = 160).

In [ ]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:") # y_test doesn't include all classes, so confusion matrix is less than num_class by num_class
# print(cm)
# the columns represent the predicted labels (predictions)
# the rows represent the true labels (ground truth)
#                Predicted Class
#           |   Class 1   |   Class 2   |   Class 3   |
# -----------------------------------------------------
# True Class   |     TP1     |     FN1     |     FN1     |
# -----------------------------------------------------
# True Class   |     FP2     |     TP2     |     FN2     |
# -----------------------------------------------------
# True Class   |     FN3     |     FP3     |     TP3     |

mse = mean_squared_error(y_test, y_pred)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_test, y_pred)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)


# Sort y_pred and y_test based on y_test
plot_comparison(y_test, y_pred, 'Logistic Linear Regression, Test Set')
Accuracy: 0.013888888888888888
Mean squared error: 1318.5694444444443
Correlation coefficient: 0.5212639786009838
Coefficient of determination (R-squared score, R2 score): 0.20414204593680674
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = model.predict(x_train)
# Alternatively, you can get the predicted probabilities for each class
y_prob_ = model.predict_proba(x_train)

accuracy = accuracy_score(y_train, y_pred_)
print("Accuracy:", accuracy)

# cm = confusion_matrix(y_train, y_pred_)
# print("Confusion Matrix:")
# print(cm)
# the columns represent the predicted labels (predictions)
# the rows represent the true labels (ground truth)
#                Predicted Class
#              |   Class 1   |   Class 2   |   Class 3   |
# -----------------------------------------------------
# True Class   |     TP1     |     FN1     |     FN1     |
# -----------------------------------------------------
# True Class   |     FP2     |     TP2     |     FN2     |
# -----------------------------------------------------
# True Class   |     FN3     |     FP3     |     TP3     |

mse = mean_squared_error(y_train, y_pred_)
print("Mean squared error:", mse)

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)

r_squared = r2_score(y_train, y_pred_)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)

plot_comparison(y_train, y_pred_, 'Logistic Linear Regression, Train Set')
Accuracy: 0.04751461988304093
Mean squared error: 1031.3523391812867
Correlation coefficient: 0.504515155991152
Coefficient of determination (R-squared score, R2 score): 0.16281542471269117
Evaluate (reduced)¶

The model is based on classifying data into class_num (e.g., class_num = 160) intervals (histogram, by np.digitize). Here, evaluate the results a smaller number of classes (e.g., reduced_class_num = 16), that is, for the example of class_num = 160 and reduced_class_num = 16, classes 0, 1, ..., 15 become one class, i.e., 0; ...; classes 144, 145, ..., 159 become one class, i.e., 15.

In [ ]:
print("---- ---- ----")
# Define the boundaries for digitization
reduced_class_num = 16
intervals = np.arange(0, class_num + 1, class_num / reduced_class_num)
print(intervals)

y_train_digital = np.digitize(y_train, intervals) - 1
y_test_digital = np.digitize(y_test, intervals) - 1
y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
y_pred_test_digital = np.digitize(y_pred, intervals) - 1
print(y_pred)
print(y_pred_test_digital)
mse = mean_squared_error(y_test_digital, y_pred_test_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_test_digital, y_pred_test_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
mse = mean_squared_error(y_train_digital, y_pred_train_digital)
print("Mean squared error:", mse)
correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
print("Correlation coefficient:", correlation)
r_squared = r2_score(y_train_digital, y_pred_train_digital)
print("Coefficient of determination (R-squared score, R2 score):", r_squared)
---- ---- ----
[  0.  30.  60.  90. 120. 150. 180. 210. 240. 270. 300. 330. 360. 390.
 420. 450. 480.]
[162 141 167 152 162 170 168 166 162 170 152 169 260 138 167 170 157 173
 162  99 166 152 162 163 162 378 152 190 140 162 162 166 162 162 190 140
 150 180 162 166 157 147 166 170 163 166 152 138 162 202 190 162 190 166
 174 190 171 162 190 170 162 162 190 140 147 162 158 138 162 158 199 184]
[ 5  4  5  5  5  5  5  5  5  5  5  5  8  4  5  5  5  5  5  3  5  5  5  5
  5 12  5  6  4  5  5  5  5  5  6  4  5  6  5  5  5  4  5  5  5  5  5  4
  5  6  6  5  6  5  5  6  5  5  6  5  5  5  6  4  4  5  5  4  5  5  6  6]
Mean squared error: 1.6805555555555556
Correlation coefficient: 0.45002269860598076
Coefficient of determination (R-squared score, R2 score): 0.09617180205415499
Mean squared error: 1.2814327485380117
Correlation coefficient: 0.46589994996224815
Coefficient of determination (R-squared score, R2 score): 0.10132445839159054

Fixing the reduced class number, I enumerate the original class number to see what a original class number is better.

In [ ]:
# independent data
x = group_satcked_green
# dependent data (labels/targets)
y = np.squeeze(stacked_red)

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max

print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

reduced_class_num = 16
class_num_array = np.arange(reduced_class_num, reduced_class_num * 100 + 1, reduced_class_num)
# class_num_array = np.arange(reduced_class_num * 5, reduced_class_num * 40 + 1, reduced_class_num)
mse_test_list = []
correlation_test_list = []
r_squared_test_list = []
mse_train_list = []
correlation_train_list = []
r_squared_train_list = []

for class_num in class_num_array:
    print('---- ---- ----')
    print(f'class_num = {class_num}')

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)

    # Generate class_num+1 evenly spaced intervals
    intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
    # print(intervals)
    # Digitize the array to get the indices of the intervals
    y_train = np.digitize(y_train, intervals) - 1
    y_test = np.digitize(y_test, intervals) - 1

    # to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
    unique_elements = np.unique(y_train)
    # print("Unique elements:", unique_elements)
    print("Number of unique elements:", len(unique_elements))

    model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, multi_class='multinomial')
    fit_result = model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_ = model.predict(x_train)

    # Define the boundaries for digitization
    intervals = np.arange(0, class_num+1, class_num/16)
    print(intervals)

    y_train_digital = np.digitize(y_train, intervals) - 1
    y_test_digital = np.digitize(y_test, intervals) - 1
    y_pred_train_digital = np.digitize(y_pred_, intervals) - 1
    y_pred_test_digital = np.digitize(y_pred, intervals) - 1
    # print(y_pred)
    # print(y_pred_test_digital)
    print("test eval:")
    mse = mean_squared_error(y_test_digital, y_pred_test_digital)
    print("Mean squared error:", mse)
    correlation = np.corrcoef(y_pred_test_digital, y_test_digital)[0, 1]
    print("Correlation coefficient:", correlation)
    r_squared = r2_score(y_test_digital, y_pred_test_digital)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)
    mse_test_list.append(mse)
    correlation_test_list.append(correlation)
    r_squared_test_list.append(r_squared)

    print("train eval:")
    mse = mean_squared_error(y_train_digital, y_pred_train_digital)
    print("Mean squared error:", mse)
    correlation = np.corrcoef(y_pred_train_digital, y_train_digital)[0, 1]
    print("Correlation coefficient:", correlation)
    r_squared = r2_score(y_train_digital, y_pred_train_digital)
    print("Coefficient of determination (R-squared score, R2 score):", r_squared)
    mse_train_list.append(mse)
    correlation_train_list.append(correlation)
    r_squared_train_list.append(r_squared)

    plot_comparison(y_test, y_pred, f'Logistic Linear Regression Reduced Evaluation {class_num} to {reduced_class_num}, Test Set')
    plot_comparison(y_train, y_pred_, f'Logistic Linear Regression Reduced Evaluation {class_num} to {reduced_class_num}, Train Set')
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)
---- ---- ----
class_num = 16
Number of unique elements: 13
[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.]
test eval:
Mean squared error: 1.5416666666666667
Correlation coefficient: 0.4453030497739079
Coefficient of determination (R-squared score, R2 score): 0.17086834733893552
train eval:
Mean squared error: 1.1608187134502923
Correlation coefficient: 0.44378410042157646
Coefficient of determination (R-squared score, R2 score): 0.1859117170141732
---- ---- ----
class_num = 32
Number of unique elements: 22
[ 0.  2.  4.  6.  8. 10. 12. 14. 16. 18. 20. 22. 24. 26. 28. 30. 32.]
test eval:
Mean squared error: 1.6944444444444444
Correlation coefficient: 0.4582084037679691
Coefficient of determination (R-squared score, R2 score): 0.08870214752567696
train eval:
Mean squared error: 1.1747076023391814
Correlation coefficient: 0.46298174070265335
Coefficient of determination (R-squared score, R2 score): 0.17617136602126982
---- ---- ----
class_num = 48
Number of unique elements: 30
[ 0.  3.  6.  9. 12. 15. 18. 21. 24. 27. 30. 33. 36. 39. 42. 45. 48.]
test eval:
Mean squared error: 1.8611111111111112
Correlation coefficient: 0.43969624320050904
Coefficient of determination (R-squared score, R2 score): -0.0009337068160597539
train eval:
Mean squared error: 1.2478070175438596
Correlation coefficient: 0.4327504506197096
Coefficient of determination (R-squared score, R2 score): 0.12490636079546202
---- ---- ----
class_num = 64
Number of unique elements: 39
[ 0.  4.  8. 12. 16. 20. 24. 28. 32. 36. 40. 44. 48. 52. 56. 60. 64.]
test eval:
Mean squared error: 1.5416666666666667
Correlation coefficient: 0.4427237599869571
Coefficient of determination (R-squared score, R2 score): 0.17086834733893552
train eval:
Mean squared error: 1.1834795321637428
Correlation coefficient: 0.45276394446722484
Coefficient of determination (R-squared score, R2 score): 0.17001956539417284
---- ---- ----
class_num = 80
Number of unique elements: 47
[ 0.  5. 10. 15. 20. 25. 30. 35. 40. 45. 50. 55. 60. 65. 70. 75. 80.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.4461650275964236
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.1769005847953216
Correlation coefficient: 0.47426325398766034
Coefficient of determination (R-squared score, R2 score): 0.17463341586449554
---- ---- ----
class_num = 96
Number of unique elements: 53
[ 0.  6. 12. 18. 24. 30. 36. 42. 48. 54. 60. 66. 72. 78. 84. 90. 96.]
test eval:
Mean squared error: 1.5972222222222223
Correlation coefficient: 0.48152746604882574
Coefficient of determination (R-squared score, R2 score): 0.1409897292250234
train eval:
Mean squared error: 1.2149122807017543
Correlation coefficient: 0.45973521922573
Coefficient of determination (R-squared score, R2 score): 0.14797561314707552
---- ---- ----
class_num = 112
Number of unique elements: 61
[  0.   7.  14.  21.  28.  35.  42.  49.  56.  63.  70.  77.  84.  91.
  98. 105. 112.]
test eval:
Mean squared error: 1.4305555555555556
Correlation coefficient: 0.5105003166926159
Coefficient of determination (R-squared score, R2 score): 0.23062558356676
train eval:
Mean squared error: 1.1739766081871346
Correlation coefficient: 0.45996186883524565
Coefficient of determination (R-squared score, R2 score): 0.1766840160735279
---- ---- ----
class_num = 128
Number of unique elements: 68
[  0.   8.  16.  24.  32.  40.  48.  56.  64.  72.  80.  88.  96. 104.
 112. 120. 128.]
test eval:
Mean squared error: 1.4166666666666667
Correlation coefficient: 0.5145106276111256
Coefficient of determination (R-squared score, R2 score): 0.23809523809523814
train eval:
Mean squared error: 1.2185672514619883
Correlation coefficient: 0.4592849257869412
Coefficient of determination (R-squared score, R2 score): 0.14541236288578518
---- ---- ----
class_num = 144
Number of unique elements: 75
[  0.   9.  18.  27.  36.  45.  54.  63.  72.  81.  90.  99. 108. 117.
 126. 135. 144.]
test eval:
Mean squared error: 1.7083333333333333
Correlation coefficient: 0.4878270961758985
Coefficient of determination (R-squared score, R2 score): 0.08123249299719892
train eval:
Mean squared error: 1.2002923976608186
Correlation coefficient: 0.4878416894302014
Coefficient of determination (R-squared score, R2 score): 0.1582286141922371
---- ---- ----
class_num = 160
Number of unique elements: 80
[  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100. 110. 120. 130.
 140. 150. 160.]
test eval:
Mean squared error: 1.4027777777777777
Correlation coefficient: 0.5081518272956013
Coefficient of determination (R-squared score, R2 score): 0.24556489262371617
train eval:
Mean squared error: 1.1608187134502923
Correlation coefficient: 0.4853767881800101
Coefficient of determination (R-squared score, R2 score): 0.1859117170141732
---- ---- ----
class_num = 176
Number of unique elements: 85
[  0.  11.  22.  33.  44.  55.  66.  77.  88.  99. 110. 121. 132. 143.
 154. 165. 176.]
test eval:
Mean squared error: 1.5694444444444444
Correlation coefficient: 0.49056003058201525
Coefficient of determination (R-squared score, R2 score): 0.15592903828197946
train eval:
Mean squared error: 1.2485380116959064
Correlation coefficient: 0.47056104507422336
Coefficient of determination (R-squared score, R2 score): 0.12439371074320393
---- ---- ----
class_num = 192
Number of unique elements: 93
[  0.  12.  24.  36.  48.  60.  72.  84.  96. 108. 120. 132. 144. 156.
 168. 180. 192.]
test eval:
Mean squared error: 1.5972222222222223
Correlation coefficient: 0.4795115105550837
Coefficient of determination (R-squared score, R2 score): 0.1409897292250234
train eval:
Mean squared error: 1.1776315789473684
Correlation coefficient: 0.49138666021574107
Coefficient of determination (R-squared score, R2 score): 0.17412076581223745
---- ---- ----
class_num = 208
Number of unique elements: 96
[  0.  13.  26.  39.  52.  65.  78.  91. 104. 117. 130. 143. 156. 169.
 182. 195. 208.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.46381682852195877
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.1732456140350878
Correlation coefficient: 0.4915007187113366
Coefficient of determination (R-squared score, R2 score): 0.17719666612578588
---- ---- ----
class_num = 224
Number of unique elements: 99
[  0.  14.  28.  42.  56.  70.  84.  98. 112. 126. 140. 154. 168. 182.
 196. 210. 224.]
test eval:
Mean squared error: 1.6805555555555556
Correlation coefficient: 0.45002269860598076
Coefficient of determination (R-squared score, R2 score): 0.09617180205415499
train eval:
Mean squared error: 1.182748538011696
Correlation coefficient: 0.4664117652507316
Coefficient of determination (R-squared score, R2 score): 0.17053221544643093
---- ---- ----
class_num = 240
Number of unique elements: 107
[  0.  15.  30.  45.  60.  75.  90. 105. 120. 135. 150. 165. 180. 195.
 210. 225. 240.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.4477622581118343
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.1805555555555556
Correlation coefficient: 0.4876444824357047
Coefficient of determination (R-squared score, R2 score): 0.1720701656032052
---- ---- ----
class_num = 256
Number of unique elements: 112
[  0.  16.  32.  48.  64.  80.  96. 112. 128. 144. 160. 176. 192. 208.
 224. 240. 256.]
test eval:
Mean squared error: 1.8333333333333333
Correlation coefficient: 0.46074029714410486
Coefficient of determination (R-squared score, R2 score): 0.014005602240896309
train eval:
Mean squared error: 1.182748538011696
Correlation coefficient: 0.5001806407541329
Coefficient of determination (R-squared score, R2 score): 0.17053221544643093
---- ---- ----
class_num = 272
Number of unique elements: 118
[  0.  17.  34.  51.  68.  85. 102. 119. 136. 153. 170. 187. 204. 221.
 238. 255. 272.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.4826532902908703
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.2887426900584795
Correlation coefficient: 0.4549296545627164
Coefficient of determination (R-squared score, R2 score): 0.09619795786900975
---- ---- ----
class_num = 288
Number of unique elements: 123
[  0.  18.  36.  54.  72.  90. 108. 126. 144. 162. 180. 198. 216. 234.
 252. 270. 288.]
test eval:
Mean squared error: 1.5138888888888888
Correlation coefficient: 0.4980726241918156
Coefficient of determination (R-squared score, R2 score): 0.1858076563958917
train eval:
Mean squared error: 1.1805555555555556
Correlation coefficient: 0.49649744134825496
Coefficient of determination (R-squared score, R2 score): 0.1720701656032052
---- ---- ----
class_num = 304
Number of unique elements: 128
[  0.  19.  38.  57.  76.  95. 114. 133. 152. 171. 190. 209. 228. 247.
 266. 285. 304.]
test eval:
Mean squared error: 1.7638888888888888
Correlation coefficient: 0.4798922788412493
Coefficient of determination (R-squared score, R2 score): 0.05135387488328669
train eval:
Mean squared error: 1.2485380116959064
Correlation coefficient: 0.47763995189741704
Coefficient of determination (R-squared score, R2 score): 0.12439371074320393
---- ---- ----
class_num = 320
Number of unique elements: 131
[  0.  20.  40.  60.  80. 100. 120. 140. 160. 180. 200. 220. 240. 260.
 280. 300. 320.]
test eval:
Mean squared error: 1.7222222222222223
Correlation coefficient: 0.4578610120916426
Coefficient of determination (R-squared score, R2 score): 0.07376283846872078
train eval:
Mean squared error: 1.246345029239766
Correlation coefficient: 0.46404033191762234
Coefficient of determination (R-squared score, R2 score): 0.1259316608999782
---- ---- ----
class_num = 336
Number of unique elements: 137
[  0.  21.  42.  63.  84. 105. 126. 147. 168. 189. 210. 231. 252. 273.
 294. 315. 336.]
test eval:
Mean squared error: 1.8194444444444444
Correlation coefficient: 0.41491897039022546
Coefficient of determination (R-squared score, R2 score): 0.02147525676937445
train eval:
Mean squared error: 1.2353801169590644
Correlation coefficient: 0.4933076950057393
Coefficient of determination (R-squared score, R2 score): 0.13362141168384933
---- ---- ----
class_num = 352
Number of unique elements: 142
[  0.  22.  44.  66.  88. 110. 132. 154. 176. 198. 220. 242. 264. 286.
 308. 330. 352.]
test eval:
Mean squared error: 1.8611111111111112
Correlation coefficient: 0.46310627542869415
Coefficient of determination (R-squared score, R2 score): -0.0009337068160597539
train eval:
Mean squared error: 1.2521929824561404
Correlation coefficient: 0.4731800075940238
Coefficient of determination (R-squared score, R2 score): 0.1218304604819136
---- ---- ----
class_num = 368
Number of unique elements: 145
[  0.  23.  46.  69.  92. 115. 138. 161. 184. 207. 230. 253. 276. 299.
 322. 345. 368.]
test eval:
Mean squared error: 1.5555555555555556
Correlation coefficient: 0.5017571314715864
Coefficient of determination (R-squared score, R2 score): 0.1633986928104575
train eval:
Mean squared error: 1.222953216374269
Correlation coefficient: 0.4622643321695171
Coefficient of determination (R-squared score, R2 score): 0.14233646257223664
---- ---- ----
class_num = 384
Number of unique elements: 154
[  0.  24.  48.  72.  96. 120. 144. 168. 192. 216. 240. 264. 288. 312.
 336. 360. 384.]
test eval:
Mean squared error: 1.8055555555555556
Correlation coefficient: 0.42511005537204366
Coefficient of determination (R-squared score, R2 score): 0.028944911297852483
train eval:
Mean squared error: 1.263157894736842
Correlation coefficient: 0.4766367538107654
Coefficient of determination (R-squared score, R2 score): 0.11414070969804246
---- ---- ----
class_num = 400
Number of unique elements: 154
[  0.  25.  50.  75. 100. 125. 150. 175. 200. 225. 250. 275. 300. 325.
 350. 375. 400.]
test eval:
Mean squared error: 1.8194444444444444
Correlation coefficient: 0.4818079423309307
Coefficient of determination (R-squared score, R2 score): 0.02147525676937445
train eval:
Mean squared error: 1.2288011695906433
Correlation coefficient: 0.4855109861861982
Coefficient of determination (R-squared score, R2 score): 0.13823526215417203
---- ---- ----
class_num = 416
Number of unique elements: 159
[  0.  26.  52.  78. 104. 130. 156. 182. 208. 234. 260. 286. 312. 338.
 364. 390. 416.]
test eval:
Mean squared error: 1.9166666666666667
Correlation coefficient: 0.44022126465290107
Coefficient of determination (R-squared score, R2 score): -0.03081232492997188
train eval:
Mean squared error: 1.2002923976608186
Correlation coefficient: 0.49459953566695747
Coefficient of determination (R-squared score, R2 score): 0.1582286141922371
---- ---- ----
class_num = 432
Number of unique elements: 165
[  0.  27.  54.  81. 108. 135. 162. 189. 216. 243. 270. 297. 324. 351.
 378. 405. 432.]
test eval:
Mean squared error: 1.6944444444444444
Correlation coefficient: 0.45616291871853465
Coefficient of determination (R-squared score, R2 score): 0.08870214752567696
train eval:
Mean squared error: 1.2383040935672514
Correlation coefficient: 0.48648322507084046
Coefficient of determination (R-squared score, R2 score): 0.13157081147481708
---- ---- ----
class_num = 448
Number of unique elements: 170
[  0.  28.  56.  84. 112. 140. 168. 196. 224. 252. 280. 308. 336. 364.
 392. 420. 448.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.4645332380852188
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.2719298245614035
Correlation coefficient: 0.46970184377210306
Coefficient of determination (R-squared score, R2 score): 0.1079889090709455
---- ---- ----
class_num = 464
Number of unique elements: 173
[  0.  29.  58.  87. 116. 145. 174. 203. 232. 261. 290. 319. 348. 377.
 406. 435. 464.]
test eval:
Mean squared error: 1.6944444444444444
Correlation coefficient: 0.4569813782824711
Coefficient of determination (R-squared score, R2 score): 0.08870214752567696
train eval:
Mean squared error: 1.2346491228070176
Correlation coefficient: 0.49020677863046425
Coefficient of determination (R-squared score, R2 score): 0.13413406173610742
---- ---- ----
class_num = 480
Number of unique elements: 180
[  0.  30.  60.  90. 120. 150. 180. 210. 240. 270. 300. 330. 360. 390.
 420. 450. 480.]
test eval:
Mean squared error: 1.6805555555555556
Correlation coefficient: 0.45002269860598076
Coefficient of determination (R-squared score, R2 score): 0.09617180205415499
train eval:
Mean squared error: 1.2814327485380117
Correlation coefficient: 0.46589994996224815
Coefficient of determination (R-squared score, R2 score): 0.10132445839159054
---- ---- ----
class_num = 496
Number of unique elements: 183
[  0.  31.  62.  93. 124. 155. 186. 217. 248. 279. 310. 341. 372. 403.
 434. 465. 496.]
test eval:
Mean squared error: 1.6666666666666667
Correlation coefficient: 0.4638064243926305
Coefficient of determination (R-squared score, R2 score): 0.10364145658263302
train eval:
Mean squared error: 1.2412280701754386
Correlation coefficient: 0.47872261325137877
Coefficient of determination (R-squared score, R2 score): 0.12952021126578472
---- ---- ----
class_num = 512
Number of unique elements: 188
[  0.  32.  64.  96. 128. 160. 192. 224. 256. 288. 320. 352. 384. 416.
 448. 480. 512.]
test eval:
Mean squared error: 1.5972222222222223
Correlation coefficient: 0.543075386641704
Coefficient of determination (R-squared score, R2 score): 0.1409897292250234
train eval:
Mean squared error: 1.2390350877192982
Correlation coefficient: 0.48054044788637845
Coefficient of determination (R-squared score, R2 score): 0.131058161422559
---- ---- ----
class_num = 528
Number of unique elements: 189
[  0.  33.  66.  99. 132. 165. 198. 231. 264. 297. 330. 363. 396. 429.
 462. 495. 528.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.42732467268306285
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.2638888888888888
Correlation coefficient: 0.4633726034083941
Coefficient of determination (R-squared score, R2 score): 0.11362805964578437
---- ---- ----
class_num = 544
Number of unique elements: 196
[  0.  34.  68. 102. 136. 170. 204. 238. 272. 306. 340. 374. 408. 442.
 476. 510. 544.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.43970398735208555
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.2836257309941521
Correlation coefficient: 0.47881187419459664
Coefficient of determination (R-squared score, R2 score): 0.09978650823481627
---- ---- ----
class_num = 560
Number of unique elements: 198
[  0.  35.  70. 105. 140. 175. 210. 245. 280. 315. 350. 385. 420. 455.
 490. 525. 560.]
test eval:
Mean squared error: 1.9444444444444444
Correlation coefficient: 0.4596242141160763
Coefficient of determination (R-squared score, R2 score): -0.045751633986928164
train eval:
Mean squared error: 1.2858187134502923
Correlation coefficient: 0.4723999346181573
Coefficient of determination (R-squared score, R2 score): 0.098248558078042
---- ---- ----
class_num = 576
Number of unique elements: 205
[  0.  36.  72. 108. 144. 180. 216. 252. 288. 324. 360. 396. 432. 468.
 504. 540. 576.]
test eval:
Mean squared error: 1.7638888888888888
Correlation coefficient: 0.43285661392592695
Coefficient of determination (R-squared score, R2 score): 0.05135387488328669
train eval:
Mean squared error: 1.2894736842105263
Correlation coefficient: 0.4639541511375892
Coefficient of determination (R-squared score, R2 score): 0.09568530781675166
---- ---- ----
class_num = 592
Number of unique elements: 211
[  0.  37.  74. 111. 148. 185. 222. 259. 296. 333. 370. 407. 444. 481.
 518. 555. 592.]
test eval:
Mean squared error: 1.7777777777777777
Correlation coefficient: 0.4348638505605208
Coefficient of determination (R-squared score, R2 score): 0.043884220354808545
train eval:
Mean squared error: 1.2163742690058479
Correlation coefficient: 0.48670768119567076
Coefficient of determination (R-squared score, R2 score): 0.14695031304255934
---- ---- ----
class_num = 608
Number of unique elements: 216
[  0.  38.  76. 114. 152. 190. 228. 266. 304. 342. 380. 418. 456. 494.
 532. 570. 608.]
test eval:
Mean squared error: 1.6805555555555556
Correlation coefficient: 0.4659531086549822
Coefficient of determination (R-squared score, R2 score): 0.09617180205415499
train eval:
Mean squared error: 1.263157894736842
Correlation coefficient: 0.480859721151576
Coefficient of determination (R-squared score, R2 score): 0.11414070969804246
---- ---- ----
class_num = 624
Number of unique elements: 216
[  0.  39.  78. 117. 156. 195. 234. 273. 312. 351. 390. 429. 468. 507.
 546. 585. 624.]
test eval:
Mean squared error: 1.9722222222222223
Correlation coefficient: 0.42114073752888537
Coefficient of determination (R-squared score, R2 score): -0.06069094304388423
train eval:
Mean squared error: 1.2923976608187135
Correlation coefficient: 0.45741126241769076
Coefficient of determination (R-squared score, R2 score): 0.0936347076077193
---- ---- ----
class_num = 640
Number of unique elements: 220
[  0.  40.  80. 120. 160. 200. 240. 280. 320. 360. 400. 440. 480. 520.
 560. 600. 640.]
test eval:
Mean squared error: 2.0277777777777777
Correlation coefficient: 0.36179838793880814
Coefficient of determination (R-squared score, R2 score): -0.09056956115779635
train eval:
Mean squared error: 1.2858187134502923
Correlation coefficient: 0.4628273394227515
Coefficient of determination (R-squared score, R2 score): 0.098248558078042
---- ---- ----
class_num = 656
Number of unique elements: 223
[  0.  41.  82. 123. 164. 205. 246. 287. 328. 369. 410. 451. 492. 533.
 574. 615. 656.]
test eval:
Mean squared error: 1.6527777777777777
Correlation coefficient: 0.4713022353857203
Coefficient of determination (R-squared score, R2 score): 0.11111111111111116
train eval:
Mean squared error: 1.29093567251462
Correlation coefficient: 0.46232912919400926
Coefficient of determination (R-squared score, R2 score): 0.09466000771223548
---- ---- ----
class_num = 672
Number of unique elements: 227
[  0.  42.  84. 126. 168. 210. 252. 294. 336. 378. 420. 462. 504. 546.
 588. 630. 672.]
test eval:
Mean squared error: 1.9305555555555556
Correlation coefficient: 0.45689655658028855
Coefficient of determination (R-squared score, R2 score): -0.03828197945845013
train eval:
Mean squared error: 1.2646198830409356
Correlation coefficient: 0.472065275387751
Coefficient of determination (R-squared score, R2 score): 0.11311540959352628
---- ---- ----
class_num = 688
Number of unique elements: 230
[  0.  43.  86. 129. 172. 215. 258. 301. 344. 387. 430. 473. 516. 559.
 602. 645. 688.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.39368855021967386
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.2638888888888888
Correlation coefficient: 0.47872789453046133
Coefficient of determination (R-squared score, R2 score): 0.11362805964578437
---- ---- ----
class_num = 704
Number of unique elements: 239
[  0.  44.  88. 132. 176. 220. 264. 308. 352. 396. 440. 484. 528. 572.
 616. 660. 704.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.4081133546510662
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.2295321637426901
Correlation coefficient: 0.47995142043816325
Coefficient of determination (R-squared score, R2 score): 0.13772261210191394
---- ---- ----
class_num = 720
Number of unique elements: 240
[  0.  45.  90. 135. 180. 225. 270. 315. 360. 405. 450. 495. 540. 585.
 630. 675. 720.]
test eval:
Mean squared error: 1.9722222222222223
Correlation coefficient: 0.4358913822419508
Coefficient of determination (R-squared score, R2 score): -0.06069094304388423
train eval:
Mean squared error: 1.4137426900584795
Correlation coefficient: 0.4110594122490662
Coefficient of determination (R-squared score, R2 score): 0.008534798932878451
---- ---- ----
class_num = 736
Number of unique elements: 241
[  0.  46.  92. 138. 184. 230. 276. 322. 368. 414. 460. 506. 552. 598.
 644. 690. 736.]
test eval:
Mean squared error: 1.5416666666666667
Correlation coefficient: 0.5050029530242061
Coefficient of determination (R-squared score, R2 score): 0.17086834733893552
train eval:
Mean squared error: 1.2456140350877194
Correlation coefficient: 0.4687999811551011
Coefficient of determination (R-squared score, R2 score): 0.1264443109522363
---- ---- ----
class_num = 752
Number of unique elements: 249
[  0.  47.  94. 141. 188. 235. 282. 329. 376. 423. 470. 517. 564. 611.
 658. 705. 752.]
test eval:
Mean squared error: 1.8194444444444444
Correlation coefficient: 0.4690525777068861
Coefficient of determination (R-squared score, R2 score): 0.02147525676937445
train eval:
Mean squared error: 1.2719298245614035
Correlation coefficient: 0.4784791515967746
Coefficient of determination (R-squared score, R2 score): 0.1079889090709455
---- ---- ----
class_num = 768
Number of unique elements: 255
[  0.  48.  96. 144. 192. 240. 288. 336. 384. 432. 480. 528. 576. 624.
 672. 720. 768.]
test eval:
Mean squared error: 1.9861111111111112
Correlation coefficient: 0.36310989106098873
Coefficient of determination (R-squared score, R2 score): -0.06816059757236226
train eval:
Mean squared error: 1.3135964912280702
Correlation coefficient: 0.46843563321580156
Coefficient of determination (R-squared score, R2 score): 0.07876785609223513
---- ---- ----
class_num = 784
Number of unique elements: 254
[  0.  49.  98. 147. 196. 245. 294. 343. 392. 441. 490. 539. 588. 637.
 686. 735. 784.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.43622657198333903
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.2492690058479532
Correlation coefficient: 0.4683260850364564
Coefficient of determination (R-squared score, R2 score): 0.12388106069094584
---- ---- ----
class_num = 800
Number of unique elements: 254
[  0.  50. 100. 150. 200. 250. 300. 350. 400. 450. 500. 550. 600. 650.
 700. 750. 800.]
test eval:
Mean squared error: 1.8055555555555556
Correlation coefficient: 0.40623732522034717
Coefficient of determination (R-squared score, R2 score): 0.028944911297852483
train eval:
Mean squared error: 1.2638888888888888
Correlation coefficient: 0.4716299005450033
Coefficient of determination (R-squared score, R2 score): 0.11362805964578437
---- ---- ----
class_num = 816
Number of unique elements: 264
[  0.  51. 102. 153. 204. 255. 306. 357. 408. 459. 510. 561. 612. 663.
 714. 765. 816.]
test eval:
Mean squared error: 1.9861111111111112
Correlation coefficient: 0.4238230770001418
Coefficient of determination (R-squared score, R2 score): -0.06816059757236226
train eval:
Mean squared error: 1.3260233918128654
Correlation coefficient: 0.4535445771012145
Coefficient of determination (R-squared score, R2 score): 0.07005280520384782
---- ---- ----
class_num = 832
Number of unique elements: 268
[  0.  52. 104. 156. 208. 260. 312. 364. 416. 468. 520. 572. 624. 676.
 728. 780. 832.]
test eval:
Mean squared error: 1.6666666666666667
Correlation coefficient: 0.4729213450495393
Coefficient of determination (R-squared score, R2 score): 0.10364145658263302
train eval:
Mean squared error: 1.3135964912280702
Correlation coefficient: 0.4649921183679015
Coefficient of determination (R-squared score, R2 score): 0.07876785609223513
---- ---- ----
class_num = 848
Number of unique elements: 268
[  0.  53. 106. 159. 212. 265. 318. 371. 424. 477. 530. 583. 636. 689.
 742. 795. 848.]
test eval:
Mean squared error: 1.7222222222222223
Correlation coefficient: 0.4546882329512439
Coefficient of determination (R-squared score, R2 score): 0.07376283846872078
train eval:
Mean squared error: 1.2616959064327486
Correlation coefficient: 0.4804104142865967
Coefficient of determination (R-squared score, R2 score): 0.11516600980255853
---- ---- ----
class_num = 864
Number of unique elements: 271
[  0.  54. 108. 162. 216. 270. 324. 378. 432. 486. 540. 594. 648. 702.
 756. 810. 864.]
test eval:
Mean squared error: 1.9444444444444444
Correlation coefficient: 0.3688229638711067
Coefficient of determination (R-squared score, R2 score): -0.045751633986928164
train eval:
Mean squared error: 1.2880116959064327
Correlation coefficient: 0.4607349824024031
Coefficient of determination (R-squared score, R2 score): 0.09671060792126784
---- ---- ----
class_num = 880
Number of unique elements: 277
[  0.  55. 110. 165. 220. 275. 330. 385. 440. 495. 550. 605. 660. 715.
 770. 825. 880.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.38606130431231317
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.273391812865497
Correlation coefficient: 0.4739358859742344
Coefficient of determination (R-squared score, R2 score): 0.10696360896642931
---- ---- ----
class_num = 896
Number of unique elements: 277
[  0.  56. 112. 168. 224. 280. 336. 392. 448. 504. 560. 616. 672. 728.
 784. 840. 896.]
test eval:
Mean squared error: 1.7638888888888888
Correlation coefficient: 0.4491387425173966
Coefficient of determination (R-squared score, R2 score): 0.05135387488328669
train eval:
Mean squared error: 1.2426900584795322
Correlation coefficient: 0.47145352124873213
Coefficient of determination (R-squared score, R2 score): 0.12849491116126854
---- ---- ----
class_num = 912
Number of unique elements: 284
[  0.  57. 114. 171. 228. 285. 342. 399. 456. 513. 570. 627. 684. 741.
 798. 855. 912.]
test eval:
Mean squared error: 2.0694444444444446
Correlation coefficient: 0.3286713353263606
Coefficient of determination (R-squared score, R2 score): -0.11297852474323067
train eval:
Mean squared error: 1.2931286549707601
Correlation coefficient: 0.45452492527877614
Coefficient of determination (R-squared score, R2 score): 0.09312205755546121
---- ---- ----
class_num = 928
Number of unique elements: 290
[  0.  58. 116. 174. 232. 290. 348. 406. 464. 522. 580. 638. 696. 754.
 812. 870. 928.]
test eval:
Mean squared error: 1.8194444444444444
Correlation coefficient: 0.4141384900804814
Coefficient of determination (R-squared score, R2 score): 0.02147525676937445
train eval:
Mean squared error: 1.202485380116959
Correlation coefficient: 0.5054918407320819
Coefficient of determination (R-squared score, R2 score): 0.15669066403546283
---- ---- ----
class_num = 944
Number of unique elements: 292
[  0.  59. 118. 177. 236. 295. 354. 413. 472. 531. 590. 649. 708. 767.
 826. 885. 944.]
test eval:
Mean squared error: 1.9166666666666667
Correlation coefficient: 0.37831037130187456
Coefficient of determination (R-squared score, R2 score): -0.03081232492997188
train eval:
Mean squared error: 1.2609649122807018
Correlation coefficient: 0.4669473803398881
Coefficient of determination (R-squared score, R2 score): 0.11567865985481662
---- ---- ----
class_num = 960
Number of unique elements: 292
[  0.  60. 120. 180. 240. 300. 360. 420. 480. 540. 600. 660. 720. 780.
 840. 900. 960.]
test eval:
Mean squared error: 1.9583333333333333
Correlation coefficient: 0.37051266633703217
Coefficient of determination (R-squared score, R2 score): -0.053221288515406195
train eval:
Mean squared error: 1.2953216374269005
Correlation coefficient: 0.47384954046268224
Coefficient of determination (R-squared score, R2 score): 0.09158410739868705
---- ---- ----
class_num = 976
Number of unique elements: 300
[  0.  61. 122. 183. 244. 305. 366. 427. 488. 549. 610. 671. 732. 793.
 854. 915. 976.]
test eval:
Mean squared error: 1.7638888888888888
Correlation coefficient: 0.4178303392856154
Coefficient of determination (R-squared score, R2 score): 0.05135387488328669
train eval:
Mean squared error: 1.1900584795321638
Correlation coefficient: 0.49086344589777897
Coefficient of determination (R-squared score, R2 score): 0.16540571492385014
---- ---- ----
class_num = 992
Number of unique elements: 302
[  0.  62. 124. 186. 248. 310. 372. 434. 496. 558. 620. 682. 744. 806.
 868. 930. 992.]
test eval:
Mean squared error: 1.7638888888888888
Correlation coefficient: 0.43093816661664314
Coefficient of determination (R-squared score, R2 score): 0.05135387488328669
train eval:
Mean squared error: 1.2646198830409356
Correlation coefficient: 0.4701594985779692
Coefficient of determination (R-squared score, R2 score): 0.11311540959352628
---- ---- ----
class_num = 1008
Number of unique elements: 306
[   0.   63.  126.  189.  252.  315.  378.  441.  504.  567.  630.  693.
  756.  819.  882.  945. 1008.]
test eval:
Mean squared error: 2.1666666666666665
Correlation coefficient: 0.30371772737300906
Coefficient of determination (R-squared score, R2 score): -0.1652661064425771
train eval:
Mean squared error: 1.3720760233918128
Correlation coefficient: 0.4490251632286824
Coefficient of determination (R-squared score, R2 score): 0.03775585191158892
---- ---- ----
class_num = 1024
Number of unique elements: 304
[   0.   64.  128.  192.  256.  320.  384.  448.  512.  576.  640.  704.
  768.  832.  896.  960. 1024.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.37797942443125626
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.2997076023391814
Correlation coefficient: 0.45309109016089966
Coefficient of determination (R-squared score, R2 score): 0.08850820708513851
---- ---- ----
class_num = 1040
Number of unique elements: 312
[   0.   65.  130.  195.  260.  325.  390.  455.  520.  585.  650.  715.
  780.  845.  910.  975. 1040.]
test eval:
Mean squared error: 1.9305555555555556
Correlation coefficient: 0.3796621809376958
Coefficient of determination (R-squared score, R2 score): -0.03828197945845013
train eval:
Mean squared error: 1.307748538011696
Correlation coefficient: 0.4837275705321927
Coefficient of determination (R-squared score, R2 score): 0.08286905651029974
---- ---- ----
class_num = 1056
Number of unique elements: 316
[   0.   66.  132.  198.  264.  330.  396.  462.  528.  594.  660.  726.
  792.  858.  924.  990. 1056.]
test eval:
Mean squared error: 2.0555555555555554
Correlation coefficient: 0.34656448124174166
Coefficient of determination (R-squared score, R2 score): -0.10550887021475264
train eval:
Mean squared error: 1.2916666666666667
Correlation coefficient: 0.4608112848899926
Coefficient of determination (R-squared score, R2 score): 0.0941473576599774
---- ---- ----
class_num = 1072
Number of unique elements: 320
[   0.   67.  134.  201.  268.  335.  402.  469.  536.  603.  670.  737.
  804.  871.  938. 1005. 1072.]
test eval:
Mean squared error: 1.7361111111111112
Correlation coefficient: 0.4252154709685706
Coefficient of determination (R-squared score, R2 score): 0.06629318394024275
train eval:
Mean squared error: 1.1966374269005848
Correlation coefficient: 0.49824566710039364
Coefficient of determination (R-squared score, R2 score): 0.16079186445352744
---- ---- ----
class_num = 1088
Number of unique elements: 318
[   0.   68.  136.  204.  272.  340.  408.  476.  544.  612.  680.  748.
  816.  884.  952. 1020. 1088.]
test eval:
Mean squared error: 1.7777777777777777
Correlation coefficient: 0.4045666488217864
Coefficient of determination (R-squared score, R2 score): 0.043884220354808545
train eval:
Mean squared error: 1.2580409356725146
Correlation coefficient: 0.475614789140867
Coefficient of determination (R-squared score, R2 score): 0.11772926006384898
---- ---- ----
class_num = 1104
Number of unique elements: 323
[   0.   69.  138.  207.  276.  345.  414.  483.  552.  621.  690.  759.
  828.  897.  966. 1035. 1104.]
test eval:
Mean squared error: 1.8611111111111112
Correlation coefficient: 0.4029068861759726
Coefficient of determination (R-squared score, R2 score): -0.0009337068160597539
train eval:
Mean squared error: 1.293859649122807
Correlation coefficient: 0.4594385999256325
Coefficient of determination (R-squared score, R2 score): 0.09260940750320323
---- ---- ----
class_num = 1120
Number of unique elements: 325
[   0.   70.  140.  210.  280.  350.  420.  490.  560.  630.  700.  770.
  840.  910.  980. 1050. 1120.]
test eval:
Mean squared error: 1.8611111111111112
Correlation coefficient: 0.38371943142152887
Coefficient of determination (R-squared score, R2 score): -0.0009337068160597539
train eval:
Mean squared error: 1.2426900584795322
Correlation coefficient: 0.48807596539487264
Coefficient of determination (R-squared score, R2 score): 0.12849491116126854
---- ---- ----
class_num = 1136
Number of unique elements: 331
[   0.   71.  142.  213.  284.  355.  426.  497.  568.  639.  710.  781.
  852.  923.  994. 1065. 1136.]
test eval:
Mean squared error: 1.6944444444444444
Correlation coefficient: 0.4517964129116188
Coefficient of determination (R-squared score, R2 score): 0.08870214752567696
train eval:
Mean squared error: 1.2141812865497077
Correlation coefficient: 0.49809781384805735
Coefficient of determination (R-squared score, R2 score): 0.14848826319933361
---- ---- ----
class_num = 1152
Number of unique elements: 339
[   0.   72.  144.  216.  288.  360.  432.  504.  576.  648.  720.  792.
  864.  936. 1008. 1080. 1152.]
test eval:
Mean squared error: 2.0
Correlation coefficient: 0.35483220545239
Coefficient of determination (R-squared score, R2 score): -0.07563025210084029
train eval:
Mean squared error: 1.3625730994152048
Correlation coefficient: 0.45486328401539844
Coefficient of determination (R-squared score, R2 score): 0.04442030259094387
---- ---- ----
class_num = 1168
Number of unique elements: 336
[   0.   73.  146.  219.  292.  365.  438.  511.  584.  657.  730.  803.
  876.  949. 1022. 1095. 1168.]
test eval:
Mean squared error: 2.0972222222222223
Correlation coefficient: 0.34319982741955934
Coefficient of determination (R-squared score, R2 score): -0.12791783380018673
train eval:
Mean squared error: 1.2792397660818713
Correlation coefficient: 0.4745832664711267
Coefficient of determination (R-squared score, R2 score): 0.1028624085483647
---- ---- ----
class_num = 1184
Number of unique elements: 342
[   0.   74.  148.  222.  296.  370.  444.  518.  592.  666.  740.  814.
  888.  962. 1036. 1110. 1184.]
test eval:
Mean squared error: 1.8333333333333333
Correlation coefficient: 0.4137347989594087
Coefficient of determination (R-squared score, R2 score): 0.014005602240896309
train eval:
Mean squared error: 1.243421052631579
Correlation coefficient: 0.485483682213542
Coefficient of determination (R-squared score, R2 score): 0.12798226110901056
---- ---- ----
class_num = 1200
Number of unique elements: 346
[   0.   75.  150.  225.  300.  375.  450.  525.  600.  675.  750.  825.
  900.  975. 1050. 1125. 1200.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.3856102534693767
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.2704678362573099
Correlation coefficient: 0.4891116040869614
Coefficient of determination (R-squared score, R2 score): 0.10901420917546167
---- ---- ----
class_num = 1216
Number of unique elements: 347
[   0.   76.  152.  228.  304.  380.  456.  532.  608.  684.  760.  836.
  912.  988. 1064. 1140. 1216.]
test eval:
Mean squared error: 1.9166666666666667
Correlation coefficient: 0.37518576343555965
Coefficient of determination (R-squared score, R2 score): -0.03081232492997188
train eval:
Mean squared error: 1.1944444444444444
Correlation coefficient: 0.5053984255768346
Coefficient of determination (R-squared score, R2 score): 0.16232981461030171
---- ---- ----
class_num = 1232
Number of unique elements: 347
[   0.   77.  154.  231.  308.  385.  462.  539.  616.  693.  770.  847.
  924. 1001. 1078. 1155. 1232.]
test eval:
Mean squared error: 1.7777777777777777
Correlation coefficient: 0.4084362837832298
Coefficient of determination (R-squared score, R2 score): 0.043884220354808545
train eval:
Mean squared error: 1.2295321637426901
Correlation coefficient: 0.48753829537709903
Coefficient of determination (R-squared score, R2 score): 0.13772261210191394
---- ---- ----
class_num = 1248
Number of unique elements: 352
[   0.   78.  156.  234.  312.  390.  468.  546.  624.  702.  780.  858.
  936. 1014. 1092. 1170. 1248.]
test eval:
Mean squared error: 2.013888888888889
Correlation coefficient: 0.3399437590796833
Coefficient of determination (R-squared score, R2 score): -0.08309990662931832
train eval:
Mean squared error: 1.2448830409356726
Correlation coefficient: 0.477891117240075
Coefficient of determination (R-squared score, R2 score): 0.12695696100449438
---- ---- ----
class_num = 1264
Number of unique elements: 362
[   0.   79.  158.  237.  316.  395.  474.  553.  632.  711.  790.  869.
  948. 1027. 1106. 1185. 1264.]
test eval:
Mean squared error: 1.9166666666666667
Correlation coefficient: 0.383207539724106
Coefficient of determination (R-squared score, R2 score): -0.03081232492997188
train eval:
Mean squared error: 1.2251461988304093
Correlation coefficient: 0.4939207915172938
Coefficient of determination (R-squared score, R2 score): 0.14079851241546248
---- ---- ----
class_num = 1280
Number of unique elements: 356
[   0.   80.  160.  240.  320.  400.  480.  560.  640.  720.  800.  880.
  960. 1040. 1120. 1200. 1280.]
test eval:
Mean squared error: 1.7361111111111112
Correlation coefficient: 0.4474038384503889
Coefficient of determination (R-squared score, R2 score): 0.06629318394024275
train eval:
Mean squared error: 1.3596491228070176
Correlation coefficient: 0.4520649394372574
Coefficient of determination (R-squared score, R2 score): 0.04647090279997623
---- ---- ----
class_num = 1296
Number of unique elements: 366
[   0.   81.  162.  243.  324.  405.  486.  567.  648.  729.  810.  891.
  972. 1053. 1134. 1215. 1296.]
test eval:
Mean squared error: 1.9722222222222223
Correlation coefficient: 0.3706095612956417
Coefficient of determination (R-squared score, R2 score): -0.06069094304388423
train eval:
Mean squared error: 1.2675438596491229
Correlation coefficient: 0.47292463300046395
Coefficient of determination (R-squared score, R2 score): 0.11106480938449392
---- ---- ----
class_num = 1312
Number of unique elements: 368
[   0.   82.  164.  246.  328.  410.  492.  574.  656.  738.  820.  902.
  984. 1066. 1148. 1230. 1312.]
test eval:
Mean squared error: 1.7777777777777777
Correlation coefficient: 0.40456664882178633
Coefficient of determination (R-squared score, R2 score): 0.043884220354808545
train eval:
Mean squared error: 1.2295321637426901
Correlation coefficient: 0.49208731415051526
Coefficient of determination (R-squared score, R2 score): 0.13772261210191394
---- ---- ----
class_num = 1328
Number of unique elements: 370
[   0.   83.  166.  249.  332.  415.  498.  581.  664.  747.  830.  913.
  996. 1079. 1162. 1245. 1328.]
test eval:
Mean squared error: 1.9444444444444444
Correlation coefficient: 0.36176557943869
Coefficient of determination (R-squared score, R2 score): -0.045751633986928164
train eval:
Mean squared error: 1.2887426900584795
Correlation coefficient: 0.45964704856357114
Coefficient of determination (R-squared score, R2 score): 0.09619795786900975
---- ---- ----
class_num = 1344
Number of unique elements: 372
[   0.   84.  168.  252.  336.  420.  504.  588.  672.  756.  840.  924.
 1008. 1092. 1176. 1260. 1344.]
test eval:
Mean squared error: 1.875
Correlation coefficient: 0.38099053375714387
Coefficient of determination (R-squared score, R2 score): -0.008403361344537785
train eval:
Mean squared error: 1.182748538011696
Correlation coefficient: 0.5003685087544218
Coefficient of determination (R-squared score, R2 score): 0.17053221544643093
---- ---- ----
class_num = 1360
Number of unique elements: 371
[   0.   85.  170.  255.  340.  425.  510.  595.  680.  765.  850.  935.
 1020. 1105. 1190. 1275. 1360.]
test eval:
Mean squared error: 1.7083333333333333
Correlation coefficient: 0.435047042551659
Coefficient of determination (R-squared score, R2 score): 0.08123249299719892
train eval:
Mean squared error: 1.2682748538011697
Correlation coefficient: 0.4664945248439736
Coefficient of determination (R-squared score, R2 score): 0.11055215933223583
---- ---- ----
class_num = 1376
Number of unique elements: 377
[   0.   86.  172.  258.  344.  430.  516.  602.  688.  774.  860.  946.
 1032. 1118. 1204. 1290. 1376.]
test eval:
Mean squared error: 1.8611111111111112
Correlation coefficient: 0.413434161766323
Coefficient of determination (R-squared score, R2 score): -0.0009337068160597539
train eval:
Mean squared error: 1.236842105263158
Correlation coefficient: 0.49104931350658143
Coefficient of determination (R-squared score, R2 score): 0.13259611157933315
---- ---- ----
class_num = 1392
Number of unique elements: 383
[   0.   87.  174.  261.  348.  435.  522.  609.  696.  783.  870.  957.
 1044. 1131. 1218. 1305. 1392.]
test eval:
Mean squared error: 2.0972222222222223
Correlation coefficient: 0.33083959438263544
Coefficient of determination (R-squared score, R2 score): -0.12791783380018673
train eval:
Mean squared error: 1.226608187134503
Correlation coefficient: 0.5058904561511268
Coefficient of determination (R-squared score, R2 score): 0.1397732123109463
---- ---- ----
class_num = 1408
Number of unique elements: 382
[   0.   88.  176.  264.  352.  440.  528.  616.  704.  792.  880.  968.
 1056. 1144. 1232. 1320. 1408.]
test eval:
Mean squared error: 2.0833333333333335
Correlation coefficient: 0.34456362234832855
Coefficient of determination (R-squared score, R2 score): -0.1204481792717087
train eval:
Mean squared error: 1.3004385964912282
Correlation coefficient: 0.4601800591879395
Coefficient of determination (R-squared score, R2 score): 0.08799555703288053
---- ---- ----
class_num = 1424
Number of unique elements: 386
[   0.   89.  178.  267.  356.  445.  534.  623.  712.  801.  890.  979.
 1068. 1157. 1246. 1335. 1424.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.41574004869527287
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.1710526315789473
Correlation coefficient: 0.5141586856048364
Coefficient of determination (R-squared score, R2 score): 0.17873461628256015
---- ---- ----
class_num = 1440
Number of unique elements: 396
[   0.   90.  180.  270.  360.  450.  540.  630.  720.  810.  900.  990.
 1080. 1170. 1260. 1350. 1440.]
test eval:
Mean squared error: 1.7777777777777777
Correlation coefficient: 0.41325664650877453
Coefficient of determination (R-squared score, R2 score): 0.043884220354808545
train eval:
Mean squared error: 1.185672514619883
Correlation coefficient: 0.4902188114305292
Coefficient of determination (R-squared score, R2 score): 0.16848161523739857
---- ---- ----
class_num = 1456
Number of unique elements: 392
[   0.   91.  182.  273.  364.  455.  546.  637.  728.  819.  910. 1001.
 1092. 1183. 1274. 1365. 1456.]
test eval:
Mean squared error: 1.8472222222222223
Correlation coefficient: 0.3922657747207214
Coefficient of determination (R-squared score, R2 score): 0.006535947712418277
train eval:
Mean squared error: 1.2076023391812865
Correlation coefficient: 0.4945476084131888
Coefficient of determination (R-squared score, R2 score): 0.15310211366965631
---- ---- ----
class_num = 1472
Number of unique elements: 396
[   0.   92.  184.  276.  368.  460.  552.  644.  736.  828.  920. 1012.
 1104. 1196. 1288. 1380. 1472.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.39368855021967386
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.2485380116959064
Correlation coefficient: 0.5032400900486917
Coefficient of determination (R-squared score, R2 score): 0.12439371074320393
---- ---- ----
class_num = 1488
Number of unique elements: 402
[   0.   93.  186.  279.  372.  465.  558.  651.  744.  837.  930. 1023.
 1116. 1209. 1302. 1395. 1488.]
test eval:
Mean squared error: 1.875
Correlation coefficient: 0.41048083566793636
Coefficient of determination (R-squared score, R2 score): -0.008403361344537785
train eval:
Mean squared error: 1.2580409356725146
Correlation coefficient: 0.48405756700865465
Coefficient of determination (R-squared score, R2 score): 0.11772926006384898
---- ---- ----
class_num = 1504
Number of unique elements: 404
[   0.   94.  188.  282.  376.  470.  564.  658.  752.  846.  940. 1034.
 1128. 1222. 1316. 1410. 1504.]
test eval:
Mean squared error: 1.9027777777777777
Correlation coefficient: 0.39678485708741124
Coefficient of determination (R-squared score, R2 score): -0.023342670401493848
train eval:
Mean squared error: 1.273391812865497
Correlation coefficient: 0.4896315247523248
Coefficient of determination (R-squared score, R2 score): 0.10696360896642931
---- ---- ----
class_num = 1520
Number of unique elements: 404
[   0.   95.  190.  285.  380.  475.  570.  665.  760.  855.  950. 1045.
 1140. 1235. 1330. 1425. 1520.]
test eval:
Mean squared error: 1.7916666666666667
Correlation coefficient: 0.40697222497305247
Coefficient of determination (R-squared score, R2 score): 0.036414565826330514
train eval:
Mean squared error: 1.1973684210526316
Correlation coefficient: 0.4913412954857841
Coefficient of determination (R-squared score, R2 score): 0.16027921440126935
---- ---- ----
class_num = 1536
Number of unique elements: 408
[   0.   96.  192.  288.  384.  480.  576.  672.  768.  864.  960. 1056.
 1152. 1248. 1344. 1440. 1536.]
test eval:
Mean squared error: 2.2083333333333335
Correlation coefficient: 0.30649383154711757
Coefficient of determination (R-squared score, R2 score): -0.1876750700280112
train eval:
Mean squared error: 1.3245614035087718
Correlation coefficient: 0.47964636775154507
Coefficient of determination (R-squared score, R2 score): 0.07107810530836389
---- ---- ----
class_num = 1552
Number of unique elements: 405
[   0.   97.  194.  291.  388.  485.  582.  679.  776.  873.  970. 1067.
 1164. 1261. 1358. 1455. 1552.]
test eval:
Mean squared error: 1.7083333333333333
Correlation coefficient: 0.4414361692952945
Coefficient of determination (R-squared score, R2 score): 0.08123249299719892
train eval:
Mean squared error: 1.2485380116959064
Correlation coefficient: 0.488534227346123
Coefficient of determination (R-squared score, R2 score): 0.12439371074320393
---- ---- ----
class_num = 1568
Number of unique elements: 413
[   0.   98.  196.  294.  392.  490.  588.  686.  784.  882.  980. 1078.
 1176. 1274. 1372. 1470. 1568.]
test eval:
Mean squared error: 1.9305555555555556
Correlation coefficient: 0.3582330363535253
Coefficient of determination (R-squared score, R2 score): -0.03828197945845013
train eval:
Mean squared error: 1.161549707602339
Correlation coefficient: 0.506796995884191
Coefficient of determination (R-squared score, R2 score): 0.18539906696191522
---- ---- ----
class_num = 1584
Number of unique elements: 417
[   0.   99.  198.  297.  396.  495.  594.  693.  792.  891.  990. 1089.
 1188. 1287. 1386. 1485. 1584.]
test eval:
Mean squared error: 1.9722222222222223
Correlation coefficient: 0.38708808512121956
Coefficient of determination (R-squared score, R2 score): -0.06069094304388423
train eval:
Mean squared error: 1.2551169590643274
Correlation coefficient: 0.4768290483095451
Coefficient of determination (R-squared score, R2 score): 0.11977986027288123
---- ---- ----
class_num = 1600
Number of unique elements: 415
[   0.  100.  200.  300.  400.  500.  600.  700.  800.  900. 1000. 1100.
 1200. 1300. 1400. 1500. 1600.]
test eval:
Mean squared error: 2.013888888888889
Correlation coefficient: 0.36854770096451805
Coefficient of determination (R-squared score, R2 score): -0.08309990662931832
train eval:
Mean squared error: 1.243421052631579
Correlation coefficient: 0.49224819113644336
Coefficient of determination (R-squared score, R2 score): 0.12798226110901056
In [ ]:
# plot the trend figures of mse, correlation, and r2

# Create a figure and subplots
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(8, 12))

# Plot MSE
ax1.plot(class_num_array, mse_test_list, label='MSE (Test)')
ax1.plot(class_num_array, mse_train_list, label='MSE (Train)')
ax1.set_ylabel('MSE')
ax1.set_xlabel('Original Class Number')
ax1.set_title(f'MSE Curve (reduced class num is {reduced_class_num})')
ax1.legend()

# Plot Correlation
ax2.plot(class_num_array, correlation_test_list, label='Correlation (Test)')
ax2.plot(class_num_array, correlation_train_list, label='Correlation (Train)')
ax2.set_ylabel('Correlation')
ax2.set_xlabel('Original Class Number')
ax2.set_title(f'Correlation Curve (reduced class num is {reduced_class_num})')
ax2.legend()

# Plot R-squared
ax3.plot(class_num_array, r_squared_test_list, label='R-squared (Test)')
ax3.plot(class_num_array, r_squared_train_list, label='R-squared (Train)')
ax3.set_ylabel('R-squared')
ax3.set_xlabel('Original Class Number')
ax3.set_title(f'R-squared Curve (reduced class num is {reduced_class_num})')
ax3.legend()

# Increase the vertical spacing between subplots
plt.subplots_adjust(hspace=0.5)

# Adjust tick, label, title, and legend font sizes
plt.rcParams.update({'font.size': 12})
ax1.tick_params(labelsize=10)
ax2.tick_params(labelsize=10)
ax3.tick_params(labelsize=10)

# Save the figure
plt.savefig(f'mse_correlation_r2_trend_curve_reduced_eval_reduced_class_num_{reduced_class_num}.png', bbox_inches='tight')

# Show the figure
plt.show()

# Convert r_squared_test_list to a NumPy array
r_squared_test_array = np.array(r_squared_test_list)
# Find the index of the maximum value
max_index = np.argmax(r_squared_test_array)
# Get the corresponding class_num value
max_class_num = class_num_array[max_index]

# Print the index and corresponding class_num
print("Max Index:", max_index)
print("Max Original Class Num:", max_class_num)
Max Index: 9
Max Original Class Num: 160

Fit and predict (balanced weights)¶

balanced weights don't improve the fitting, but make it worse.

In [ ]:
# independent data
x = group_satcked_green

class_num = 48

# dependent data (labels/targets)
y = np.squeeze(stacked_red)
# print(np.max(y), np.min(y))

# Split the data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.05, random_state=42)
# Calculate the minimum and maximum values
infinitesimal = np.finfo(float).eps
min_val = np.min(y) - infinitesimal # to guarantee to include min
max_val = np.max(y) + infinitesimal # to guarantee to include max
# Generate class_num+1 evenly spaced intervals
intervals = np.linspace(min_val, max_val, num=class_num+1) # num = class num + 1
# print(intervals)
# Digitize the array to get the indices of the intervals
y_train = np.digitize(y_train, intervals) - 1
y_test = np.digitize(y_test, intervals) - 1
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

# to see unique elements (see if we have all 0, 1,..., class_num-1 classes, better close to all)
unique_elements = np.unique(y_train)
print("Unique elements:", unique_elements)
print("Number of unique elements:", len(unique_elements))
x_train shape: (1368, 5)
y_train shape: (1368,)
x_test shape: (72, 5)
y_test shape: (72,)
Unique elements: [ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 29 30
 33 34 35 37 42 47]
Number of unique elements: 30
In [ ]:
# fit
model = linear_model.LogisticRegression(fit_intercept=True, max_iter=1000, class_weight='balanced', multi_class='multinomial')
fit_result = model.fit(x_train, y_train)
print(fit_result.intercept_.shape, fit_result.coef_.shape)

# predict
# Use the trained model to make predictions
y_pred = model.predict(x_test)
# Alternatively, you can get the predicted probabilities for each class
y_prob = model.predict_proba(x_test)
# Print the predicted class labels
print(y_pred, y_test)
print(y_pred.shape, y_test.shape)
# Print the predicted probabilities
# print(y_prob)
(30,) (30, 5)
[ 7  7 11  9  8 19 29 21 14 20 14 11 35 13 11 26 13 15 20  9 22 14  8 20
  8 37  9 25 29  8 17 21  8 15 19 10 19 26  8 19  9  8 29 26 25 11 14 13
  8 27 26 15 25 18  8 14 13 14 14 20 20 13 14  7  9 20 22 13  8 20 29 22] [15 18 16 12 15 13 19 18 16 18 16 15 26 17 18 15 12 17 25  0 17 17 15 13
 14 26 18 16 13 13 16 17 14 15 13 12 14 18 17 16 18 12 17 30 19 15 15 18
 15 27 14 17 17 16 17 15 15 14 16 17 21 14 16 17 15 19 13  7 20 17 16 18]
(72,) (72,)
/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(

Evaluate (balanced weights)¶

In [ ]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# cm = confusion_matrix(y_test, y_pred)
# print("Confusion Matrix:")
# print(cm)
# the columns represent the predicted labels (predictions)
# the rows represent the true labels (ground truth)
#                Predicted Class
#           |   Class 1   |   Class 2   |   Class 3   |
# -----------------------------------------------------
# True Class   |     TP1     |     FN1     |     FN1     |
# -----------------------------------------------------
# True Class   |     FP2     |     TP2     |     FN2     |
# -----------------------------------------------------
# True Class   |     FN3     |     FP3     |     TP3     |

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred, y_test)[0, 1]
print("Correlation coefficient:", correlation)


plot_comparison(y_test, y_pred, 'Logistic Linear Regression balanced weights, Test Set')
Accuracy: 0.041666666666666664
Correlation coefficient: 0.4563969713481123
In [ ]:
# predict on train
# Use the trained model to make predictions
y_pred_ = model.predict(x_train)
# Alternatively, you can get the predicted probabilities for each class
y_prob_ = model.predict_proba(x_train)

accuracy = accuracy_score(y_train, y_pred_)
print("Accuracy:", accuracy)

# cm = confusion_matrix(y_train, y_pred_)
# print("Confusion Matrix:")
# print(cm)
# the columns represent the predicted labels (predictions)
# the rows represent the true labels (ground truth)
#                Predicted Class
#              |   Class 1   |   Class 2   |   Class 3   |
# -----------------------------------------------------
# True Class   |     TP1     |     FN1     |     FN1     |
# -----------------------------------------------------
# True Class   |     FP2     |     TP2     |     FN2     |
# -----------------------------------------------------
# True Class   |     FN3     |     FP3     |     TP3     |

# Calculate the correlation coefficient
correlation = np.corrcoef(y_pred_, y_train)[0, 1]
print("Correlation coefficient:", correlation)


plot_comparison(y_train, y_pred_, 'Logistic Linear Regression balanced weights, Train Set')
Accuracy: 0.059941520467836254
Correlation coefficient: 0.38806679746981715

Batch download files¶

In [ ]:
# batch download the plotted figures
# uncomment the code below to download figures if needed

'''
import glob

folder_path = '.'
# file_prefix = 'Comparison (Logistic Linear Regression Reduced Evaluation'
file_prefix = 'Comparison'

# Use glob to find all files with the given prefix in the folder
matching_files = glob.glob(f"{folder_path}/{file_prefix}*")
# print(matching_files)
# # Print the matching file names
# for file_path in matching_files:
#     print(file_path)

import zipfile

zip_filename = 'files.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    # Add files to the zip file
    for file_path in matching_files:
        zipf.write(file_path)

from google.colab import files
files.download(zip_filename)
'''

Delete generated files¶

Use the code cautiously.

In [ ]:
# # Specify the path to the root folder
# root_folder = '/content'

# # Get a list of all files in the root folder
# files = os.listdir(root_folder)

# files_to_delete = [file for file in files if file.endswith(".png")]

# for file_ in files_to_delete:
#     print(file_)

# # Iterate over the files and delete them
# for file in files_to_delete:
#     file_path = os.path.join(root_folder, file)
#     if os.path.isfile(file_path):
#         os.remove(file_path)